############################## General ##############################
import os
import itertools
import math
import ast
import time
from tqdm import tqdm
############################## Data Handling ##############################
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from collections import defaultdict, Counter
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
############################## SKLearn ##############################
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, r2_score, accuracy_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import linear_model
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.manifold import TSNE
############################## Data Visualization ##############################
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
sns.set(rc={'figure.figsize':(15,8)})
from prettytable import PrettyTable
import eli5
from eli5 import show_weights, show_prediction
from eli5.sklearn import PermutationImportance
from yellowbrick.regressor import ResidualsPlot
############################## Network Analysis ##############################
import networkx as nx
from pyvis.network import Network
from nxviz import CircosPlot
from nxviz import ArcPlot
from community import community_louvain
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')
from bokeh.plotting import show
############################## Machine Learning ##############################
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from keras.utils import np_utils
############################## NLP ##############################
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gensim
from gensim.models import Phrases, LdaModel, CoherenceModel
from gensim.corpora.dictionary import Dictionary
import pyLDAvis.gensim
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0
############################## Oversampling ##############################
from imblearn.over_sampling import RandomOverSampler
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Locations of every raw and cleaned dataset, relative to the repo's Data folder.
DATA_DIRECTORY = os.getcwd() + "/../Data"
PATH_LYRICS = DATA_DIRECTORY + "/df_lyrics.csv"
PATH_LYRICS_CLEAN = DATA_DIRECTORY + "/df_lyrics_clean.csv"
PATH_ARTISTS = DATA_DIRECTORY + "/artists_with_metadata.csv"
PATH_ARTISTS_CLEAN = DATA_DIRECTORY + "/artists_with_metadata_clean.csv"
PATH_ALBUMS = DATA_DIRECTORY + "/albums_with_metadata.csv"
PATH_TRACKS = DATA_DIRECTORY + "/df_tracks.csv"
PATH_TRACKS_CLEAN = DATA_DIRECTORY + "/df_tracks_clean.csv"
PATH_NODES = DATA_DIRECTORY + "/df_nodes.csv"
PATH_EDGES = DATA_DIRECTORY + "/df_edgelist.csv"
# Seed for reproducibility wherever randomness is involved.
RANDOM = 42
# Load tracks data; list-valued columns were serialized as strings in the CSV,
# so parse them back into Python lists.
df_tracks = pd.read_csv(PATH_TRACKS, index_col=0,
                        converters={"Artist Names": ast.literal_eval,
                                    "Genres": ast.literal_eval})
# Load artists data and drop columns that were not fetched through Spotify
df_artists = pd.read_csv(PATH_ARTISTS, index_col=0, converters={"Genres": ast.literal_eval})
df_artists = df_artists.drop(['Streams', 'Tracks', '1b+', '100m+', '10m+', '1m+', 'Last Update', 'href'], axis=1)
# Load lyrics data (fix: the assignment was accidentally written twice,
# "df_lyrics = df_lyrics = pd.read_csv(...)")
df_lyrics = pd.read_csv(PATH_LYRICS, index_col=0)
df_lyrics = df_lyrics.drop(['Title_with_featured'], axis=1)
# Quick sanity checks on the artists dataframe
df_artists.head()
df_artists.info()
df_artists.describe()
# Count how often each genre appears across all artists' genre lists
genre_count = defaultdict(int)
for genres in df_artists["Genres"]:
    for genre in genres:
        genre_count[genre] += 1
print(f"Number of genres appearing in dataset: {len(genre_count.keys())}")
# Sort genre dictionary by value (most frequent first)
genre_count_sorted = sorted(genre_count.items(), key=lambda item: item[1], reverse=True)
# Print top 10 most frequent genres
genre_count_sorted[:10]
# Print bottom 10 least occurring genres
genre_count_sorted[-10:]
# Determine number of genres associated to each artist
genres_per_artist = [len(genres) for genres in df_artists["Genres"]]
# Smallest number of genres any artist has
min(genres_per_artist)
# Find all artists that don't have any genres
[i for i, x in enumerate(genres_per_artist) if x == 0]
# Look up the name of the artist at row 528 (the one without genres)
df_artists.iloc[528,:]["Artist Name"]
# Largest number of genres any artist has
max(genres_per_artist)
df_tracks.head()
df_tracks.info()
The tracks dataframe contains 445578 records in total. All but 56 of those tracks are fully populated. Those 56 tracks are missing the audio features.
df_tracks.describe()
df_tracks["Track Name"].value_counts()
# Calculate number of different track names in dataset
len(df_tracks["Track Name"].value_counts().keys())
# Calculate number of different track names that appear more than once in dataset
sum(df_tracks["Track Name"].value_counts() > 1)
# Calculate number of different track names that appear more than 50 times
sum(df_tracks["Track Name"].value_counts() > 50)
There are several potential duplicate songs in this dataset that have to be dealt with later.
# Most popular songs
df_tracks.sort_values(by="Popularity",ascending=False)[["Track Name","Artist Names","Popularity"]].head(10)
# Longest Songs
df_tracks.sort_values(by="Duration in ms",ascending=False)[["Track Name","Artist Names","Duration in ms"]].head()
# Most Acoustic Songs
df_tracks.sort_values(by="Acousticness",ascending=False)[["Track Name","Artist Names","Acousticness"]].head()
# Songs to dance to
df_tracks.sort_values(by="Danceability",ascending=False)[["Track Name","Artist Names","Danceability"]].head()
# High Energy Songs
df_tracks.sort_values(by="Energy",ascending=False)[["Track Name","Artist Names","Energy"]].head()
# Songs that put you in a good mood
df_tracks.sort_values(by="Valence",ascending=False)[["Track Name","Artist Names","Valence"]].head()
# Songs that put you in a bad mood
df_tracks.sort_values(by="Valence",ascending=True)[["Track Name","Artist Names","Valence"]].head()
# Very fast songs
df_tracks.sort_values(by="Tempo",ascending=False)[["Track Name","Artist Names","Tempo"]].head()
# Very slow songs
df_tracks.sort_values(by="Tempo",ascending=True)[["Track Name","Artist Names","Tempo"]].head()
df_tracks.columns
df_lyrics.head()
df_lyrics.info()
df_artists.explode("Genres")["Genres"].value_counts()
Assign each artist the first genre it matches.
def pick_single_genres(df_art):
    """
    Assign each artist a single "Genre": the most frequent genre (across the
    whole dataset) that appears in the artist's "Genres" list.

    Parameters:
        df_art (pd.DataFrame): must contain a "Genres" column holding lists of
            genre strings. Modified in place: a "Genre" column is added.

    Notes:
        Artists with an empty "Genres" list keep NaN in "Genre".
        Replaces the previous genre-by-genre scan (one dataframe pass per
        genre) with a single pass over the artists; same result, much faster.
    """
    # Genres ordered by descending frequency over all artists; ties keep the
    # order produced by value_counts, matching the original behaviour.
    freq_order = list(pd.DataFrame(df_art.explode("Genres")["Genres"].value_counts()).index)
    rank = {genre: pos for pos, genre in enumerate(freq_order)}

    def _best_genre(genres):
        # Pick the highest-ranked (i.e. most frequent) genre the artist has.
        candidates = [g for g in genres if g in rank]
        return min(candidates, key=rank.__getitem__) if candidates else np.nan

    df_art["Genre"] = [_best_genre(genres) for genres in df_art["Genres"]]
Apply function.
pick_single_genres(df_artists)
Check how many difference genres the dataset still holds and their distribution.
df_artists["Genre"].nunique()
pd.DataFrame(df_artists["Genre"].value_counts()).head(20)
Consolidate genres that are likely to belong together based on their names.
df_artists["Genre"] = df_artists["Genre"].replace({"dance pop": "pop dance",
"modern rock": "rock",
"contemporary country": "country",
"k-pop": "pop",
"trap latino": "latin",
"pop urbaine": "pop",
"rock en espanol": "rock",
"pop nacional": "pop",
"melodic rap": "rap",
"electropop": "pop",
"indie pop": "pop",
"australian pop": "pop",
"emo rap": "rap",
"swedish pop": "pop",
"europop": "pop",
"art pop": "pop",
"dutch pop": "pop",
"bow pop": "pop",
"post-teen pop": "pop",
"soft rock": "rock",
})
Create the list with genres sorted in descending order of occurrence.
genre_list = pd.DataFrame(df_artists["Genre"].value_counts())
Filter genre as described above.
genre_list_top_vals = genre_list[genre_list["Genre"] >= 15]
df_artists["Genre"] = df_artists["Genre"].apply(lambda x: x if x in genre_list_top_vals.index else None)
df_artists["Rank"] = df_artists.index
df_artists.to_csv(PATH_ARTISTS_CLEAN)
df_tracks["Artist Names"][0]
# Attach each artist's consolidated genre to their lyrics rows
df_lyrics_with_genre = pd.merge(left=df_lyrics,
                                right=df_artists[["Artist Name","Genre"]],
                                left_on="Artist",
                                right_on="Artist Name").drop("Artist Name",axis=1)
df_lyrics = df_lyrics_with_genre.copy()
# This block is used to identify the "main" artist of each track based on whose album the track is featured on.
# This artist's genre is then assigned to the song.
df_albums = pd.read_csv(PATH_ALBUMS,index_col=0)
# Map each track to the artist who owns the album it appears on ...
df_tracks_helper = pd.merge(left=df_tracks,
                            right=df_albums[["Album ID","Artist ID"]],
                            left_on="Album ID",
                            right_on="Album ID")
# ... then pull in that artist's genre
df_tracks_with_genre = pd.merge(left=df_tracks_helper,
                                right=df_artists[["Artist_ID","Genre"]],
                                left_on="Artist ID",
                                right_on="Artist_ID")
df_tracks = df_tracks_with_genre.copy()
# Inspect how often identical track names occur
df_tracks["Track Name"].value_counts()
Several titles appear more than once. However, a title appearing more than once is not necessarily a duplicate, since two different artists can have songs that go by the same title. Let's visualise how frequently titles sharing the same name occur.
def draw_duplicate_graph(count_list, bins=(0, 1, 3, 10, 100000), labels=("1", "2-3", "4-10", ">10")):
    """
    Plot a bar chart of how often titles occur once, 2-3, 4-10, or >10 times.

    Parameters:
        count_list (pd.Series): per-title occurrence counts
            (e.g. df["Track Name"].value_counts()).
        bins (sequence): bin edges handed to pd.cut.
        labels (sequence): one label per bin.
    """
    # Tuples as defaults avoid the shared-mutable-default-argument pitfall.
    out = pd.cut(count_list, bins=list(bins), labels=list(labels))
    ax = out.value_counts(sort=False).plot.bar(rot=0, color="b", figsize=(6, 4))
    plt.show()
draw_duplicate_graph(df_tracks["Track Name"].value_counts())
It can be seen that for the majority of titles, duplicates are no problem at all. Yet, there is still a considerable amount of titles that appear several times for the same artist.
df_tracks[df_tracks["Track Name"] == 'Violin Concerto No.2 In E, BWV 1042: 3. Allegro assai']["Popularity"]
When looking at the popularity of each instance of 'Violin Concerto No.2 In E, BWV 1042: 3. Allegro assai', which is among the most frequently occurring titles, it becomes apparent that many of its versions are not listened to very often. For that reason, the first strategy for removing duplicates is to remove all songs from the dataset that have a popularity of less than five. This operation also affects non-duplicate songs that are very unpopular, which is an intended side-effect to reduce the dataset size and keep it to the most relevant tracks.
df_tracks_pop = df_tracks[df_tracks["Popularity"] > 5]
print(f'Nr. of duplicate titles before: {sum(df_tracks["Track Name"].value_counts() > 1)}')
print(f'Nr. of duplicate titles after: {sum(df_tracks_pop["Track Name"].value_counts() > 1)}')
draw_duplicate_graph(df_tracks_pop["Track Name"].value_counts())
This step reduced the number of duplicate titles by more than 20,000.
Let's now have a closer look on "Intro", which is the title occuring most frequently.
df_tracks_pop[df_tracks_pop["Track Name"] == "Intro"].head()
Some of those tracks are obviously duplicates in the sense that they are the very same song appearing multiple times in the dataset. For example, looking at indices 2165 and 2201, the tracks have the same title, artist, album, and even duration. They are also very similar (albeit not completely the same) in term of their audio features. Similar observations can be made for many other songs in the dataset as well. In all those cases, keeping both/all songs in the dataset would not make a lot of sense. To remove those duplicates, the following procedure is applied:
Below is an example on how dropping the duplicates works illustrated based on tracks called "Intro".
# Only keep Artist Names, Track Name, and Popularity from dataframe filtered by "Intro"
df_tracks_remove_dups = df_tracks_pop[df_tracks_pop["Track Name"]=="Intro"][["Artist Names","Track Name","Popularity"]]
# Group based on Artist Names and Track Name and sort by Popularity
df_tracks_remove_dups["Artist Names"] = df_tracks_remove_dups["Artist Names"].astype(str)
df_tracks_remove_dups_sorted = df_tracks_remove_dups.sort_values(["Artist Names","Track Name","Popularity"],ascending=[False,False,False])
# Print the dataframe filtered by "Intro"
df_tracks_remove_dups_sorted
# Remove duplicates in each Artist/Track Name pairing, only keeping the first instance
df_unique_songs = df_tracks_remove_dups_sorted.drop_duplicates(subset=['Artist Names', 'Track Name'], keep='first')
df_unique_songs.head()
Now let's apply this procedure to the entire dataset.
# Stringify the artist lists so they can serve as a sort/group key
df_tracks_pop["Artist Names"] = df_tracks_pop["Artist Names"].astype(str)
# Sort so the most popular instance of each (artist, title) pair comes first
df_tracks_sorted = df_tracks_pop.sort_values(["Artist Names","Track Name","Popularity"],ascending=[False,False,False])
# Remove duplicates in each Artist/Track Name pairing, only keeping the first (most popular) instance
df_tracks_no_dups = df_tracks_sorted.drop_duplicates(subset=['Artist Names', 'Track Name'], keep='first')
df_tracks_no_dups
# Check if the results match the "Intro" example worked through above
df_tracks_no_dups[df_tracks_no_dups["Track Name"] == "Intro"].head()
# Inspect the titles that still occur most often
df_tracks_no_dups["Track Name"].value_counts()[:20]
draw_duplicate_graph(df_tracks_no_dups["Track Name"].value_counts())
This concludes the preprocessing / duplicate removal for the tracks dataset. As can be seen above, there are still lots of titles that appear several times. However, in all cases it seems reasonable to assume that they were mostly performed by different artists and can therefore be considered unique songs.
# Restore the original row order and persist the cleaned tracks dataset
df_tracks_no_dups = df_tracks_no_dups.sort_index()
df_tracks_no_dups.to_csv(PATH_TRACKS_CLEAN)
# Read in the cleaned data (list-valued columns are parsed back from strings)
df_tracks = pd.read_csv(PATH_TRACKS_CLEAN, index_col=0,
                        converters = {"Artist Names": ast.literal_eval})
df_artists = pd.read_csv(PATH_ARTISTS_CLEAN, index_col=0,
                         converters = {"Genres": ast.literal_eval})
Since the tracks number is an external figure, it is recalculated from the songs available to us after fetching.
df_track_all_artists = df_tracks.explode("Artist Names")
All songs from artists which are part of df_artists are kept, grouped and counted.
df_artist_tracks = pd.DataFrame(df_track_all_artists[df_track_all_artists["Artist Names"].\
isin(df_artists["Artist Name"])].\
groupby(["Artist Names"])["Track Name"].count())
# Rename Column
df_artist_tracks = df_artist_tracks.rename({"Track Name": "Tracks Count"}, axis=1)
df_nodes = pd.merge(df_artists, df_artist_tracks, left_on="Artist Name", right_on="Artist Names", how="left")
Create helper column with the amount of artists collaborating on a song which is the criterion used to filter the data later on.
df_tracks["Artist Count"] = df_tracks["Artist Names"].apply(lambda x: len(x))
df_artist_collabo_tracks = df_tracks.explode("Artist Names")
The exploded data is filtered by the number of artists taking part in the track production. If more than one, the data is kept.
df_artist_collabo_tracks = df_artist_collabo_tracks[df_artist_collabo_tracks["Artist Count"] > 1]
Same as before: if the artist can be found in df_artists, the rows are kept, grouped, and counted.
df_artist_collabo_tracks = pd.DataFrame(df_artist_collabo_tracks[df_artist_collabo_tracks["Artist Names"].\
isin(df_artists["Artist Name"])].\
groupby(["Artist Names"])["Track Name"].count())
Rename column for merging.
df_artist_collabo_tracks = df_artist_collabo_tracks.rename({"Track Name": "Tracks Collabo Count"}, axis=1)
Data merging.
df_nodes = pd.merge(df_nodes, df_artist_collabo_tracks, left_on="Artist Name",
right_on="Artist Names", how="left")
Now the share of collaborations among all songs produced per artist can be computed.
df_nodes["Tracks Collabo Share"] = df_nodes["Tracks Collabo Count"]/df_nodes["Tracks Count"]
df_nodes.drop("Tracks Collabo Count", axis=1, inplace=True)
df_nodes["Tracks Count"].fillna(0, inplace=True)
df_nodes["Tracks Collabo Share"].fillna(0, inplace=True)
The artist name Joey Bada$$ leads to issues later on with nx.draw_kamada_kawai. Therefore the artist is renamed
df_nodes.loc[df_nodes["Artist Name"] == "Joey Bada$$", "Artist Name"] = "Joey Badass"
Each genre gets a color assigned for the visualization later on. The colors are taken from a HEX palette generator and tried to take an opposing color for every following element to generate better contrast in network later on
genre_list_top_vals["Genre Color"] = ["#cd6155",
"#566573",
"#99a3a4",
"#3bc14a",
"#5499c7",
"#057476",
"#2980b9",
"#bb8fce",
"#ffe900"]
Filtering down the artists in df_artists to those belonging to the top genres selected. From now on df_nodes is considered the "master data" for the network analysis.
df_nodes = df_nodes[df_nodes["Genre"].isin(list(genre_list_top_vals.index))]
The genre colors are added to df_nodes by joining them on the genre.
df_nodes
genre_list_top_vals
df_nodes = pd.merge(df_nodes,
genre_list_top_vals,
left_on="Genre",
right_index=True,
how="left",
suffixes=('', '_y')).drop(columns=["Genre_y"])
The popularity score and number of tracks by each artist could lead to interesting insights in the analysis, however, they are too granular. Therefore, new columns with bins are added.
Look at the characteristics of Popularity variable.
df_nodes["Popularity"].describe()
sns.distplot(df_nodes["Popularity"])
Calculate the range of values.
df_nodes["Popularity"].max() - df_nodes["Popularity"].min()
Since it is a low range and the variable is roughly normally distributed, a linear assignment to the bins is chosen. To have a clear cutoff, three or six bins seem ideal. Three bins are chosen to get a broader picture.
Note: The numbers are added to avoid manual sorting later on.
df_nodes["Popularity Bins"] = pd.cut(df_nodes["Popularity"], 3, labels=["3: Less Popular",
"2: Popular",
"1: Very Popular"])
Look at the characteristics of Tracks Count variable.
df_nodes["Tracks Count"].describe()
sns.distplot(df_nodes["Tracks Count"])
df_nodes["Tracks Count"].max()
df_nodes[df_nodes["Tracks Count"] == 3416]
In contrast to the popularity, the artists produce very different amounts of tracks. The distribution is right-skewed: the most diligent artist in the dataset, Grateful Dead, produced 3,416 songs, whereas the average artist produced only 147. Therefore, bins are created manually at our own discretion to cover the different artist types best.
def create_track_bins(df_artists):
    """
    Add a "Tracks Bins" column that groups artists by their "Tracks Count"
    into hand-picked, labelled buckets. The dataframe is modified in place.
    """
    # (lower bound exclusive or None, upper bound inclusive, label)
    buckets = [
        (None, 25, "1: Up to 25"),
        (25, 50, "2: Up to 50"),
        (50, 100, "3: Up to 100"),
        (100, 200, "4: Up to 200"),
        (200, 300, "5: Up to 300"),
        (300, 500, "6: Up to 500"),
    ]
    counts = df_artists["Tracks Count"]
    df_artists["Tracks Bins"] = 0
    for lower, upper, label in buckets:
        mask = counts <= upper
        if lower is not None:
            mask &= counts > lower
        df_artists.loc[mask, "Tracks Bins"] = label
    # Everything above the last explicit bound lands in the open-ended bucket.
    df_artists.loc[counts > 500, "Tracks Bins"] = "7: Over 500"
Applying bin creation for tracks.
create_track_bins(df_nodes)
df_nodes.to_csv(PATH_NODES)
def create_edgelist(artist_list):
    """
    Build a weighted edge list from artist collaborations.

    Parameters:
        artist_list (list): list of lists; each inner list holds the artists
            credited on one song.

    Returns:
        df_edgelist (dataframe): one row per artist pair with columns
            'Artist_1', 'Artist_2', 'Weight' and 'Weight for Viz'.
    """
    pair_counts = defaultdict(int)
    for song_artists in artist_list:
        # Songs with a single credited artist contribute no edges.
        if len(song_artists) <= 1:
            continue
        # Sorting first makes each unordered artist pair canonical.
        for pair in itertools.combinations(sorted(song_artists), 2):
            pair_counts[pair] += 1
    # The pair tuples become a two-level index, then two columns.
    edges = pd.Series(pair_counts).reset_index()
    edges.columns = ['Artist_1', 'Artist_2', 'Weight']
    # Scaled-down weight; raw counts drew overly thick lines in the plots.
    edges["Weight for Viz"] = edges["Weight"] / 10
    return edges
Call function to generate edgelist for all artists.
df_edgelist = create_edgelist(list(df_tracks["Artist Names"]))
len(df_edgelist)
Unfiltered, the network holds over 37 thousand edges.
By applying a filter, which excluded all artists that cannot be assigned to the top 1000 artists for both nodes, the number of edges drops significantly. This step is taken and even narrowed down further by only including the artists which were left after the genre filtering.
df_edgelist = df_edgelist[(df_edgelist["Artist_1"].isin(df_nodes["Artist Name"])) &
(df_edgelist["Artist_2"].isin(df_nodes["Artist Name"]))]
len(df_edgelist)
By doing so, the network which is further analyzed holds only around 4.4 thousand edges compared to over 37 thousand before any filtering.
This concludes the preprocessing of the data used in the network analysis. However, in the following steps the dataframe is enriched with additional data generated by the network.
df_edgelist.to_csv(PATH_EDGES)
def verse_info_removal(df_col):
    """
    Strip verse markers written in square brackets (such as "[intro]" or
    "[chorus]") from every entry of a lyrics column.
    input:
        df_col (pd.Series): column of lyric strings
    output:
        pd.Series with the bracketed sections removed
    """
    print("Removing verse info...\n")
    # Compile once; non-greedy so each bracket pair is matched separately.
    marker = re.compile(r"\[(.*?)\]")
    return df_col.apply(lambda text: marker.sub("", str(text)))
def detect_language(df_col):
    """
    Detect the language of every text in the column via langdetect.

    Parameters:
        df_col (pd.Series): column of text strings.

    Returns:
        list: language codes, with "Not classified" where detection failed
        (e.g. empty or purely non-alphabetic text).
    """
    lans = []
    for index, text in tqdm(enumerate(df_col)):
        try:
            lans.append(detect(text))
        # Was a bare `except:`, which would also swallow KeyboardInterrupt /
        # SystemExit; Exception is the widest catch that is still safe here.
        except Exception:
            print(f'Exception thrown for text "{text}" at index {index}.')
            lans.append("Not classified")
    return lans
def tokenization(df_col):
    """
    Tokenize each string of the column into a list of word tokens.
    """
    print("Tokenizing words...\n")
    # apply accepts the callable directly; no lambda wrapper needed.
    return df_col.apply(word_tokenize)
def not_appender(df_col):
    """
    Merge the "n't" token produced by negative contractions into the word that
    follows it, prefixed with "not_" (e.g. ["do", "n't", "stop"] becomes
    ["do", "not_stop"]). Returns the adjusted token lists.
    """
    print("Including negation in words...\n")

    def _merge_negations(tokens):
        # Round-trip through a string so "n't <word>" collapses to "not_<word>".
        joined = " ".join(tokens).replace("n't ", "not_")
        return joined.split(" ")

    return df_col.apply(_merge_negations)
def only_alphabetic(df_col):
    """
    Keeps only tokens that consist solely of letters and/or underscores
    (underscores survive so that "not_"-merged negation tokens are kept).

    Note: the previous pattern "[a-zA-Z0_]" also admitted the digit '0',
    which contradicted the documented "alphabetic or underscore" intent and
    looked like a typo; it has been removed.
    """
    print("Removing all non-alphabetic words...\n")
    pattern = re.compile(r"^[a-zA-Z_]*$")
    return df_col.apply(lambda tokens: [token for token in tokens if pattern.match(token)])
def lowercase(df_col):
    """
    Lowercase every token in every token list of the column.
    """
    print("Making all words lowercase...\n")
    return df_col.apply(lambda tokens: list(map(str.lower, tokens)))
stop_words = set(stopwords.words('english'))
stop_words.update(["yeah","wanna","oh","ooh","la","lala","lalala", "got","na","ayy","yo","tryna","damn","huh","ai","ayy"])
#"like","know","time","never","back","want","make","come","take","feel","right","need", # removed from top words
#"could","tell","live","keep","would","thing","still","said","every","little","around",
#"cause","really","well","something"])
def stopword_removal(df_col):
    """
    Drop stopwords and any token of length three or less from each token list.
    """
    print("Removing Stopwords...\n")

    def _keep(token):
        # Both conditions from the original filter: not a stopword, length > 3.
        return token not in stop_words and len(token) > 3

    return df_col.apply(lambda tokens: [token for token in tokens if _keep(token)])
def lemmatization(df_col):
    """
    Lemmatize every token in every token list and return the result.
    """
    print("Lemmatizing words...\n")
    # One shared lemmatizer instance for all rows.
    lemmatizer = WordNetLemmatizer()
    return df_col.apply(lambda tokens: list(map(lemmatizer.lemmatize, tokens)))
def preprocessing(df, tokenized_YN = 1, *steps):
    """
    Run a chain of preprocessing steps over the "Lyrics Raw" column.

    Input:
        - df (dataframe): dataframe containing a "Lyrics Raw" text column.
        - tokenized_YN (int): 1 returns token lists, 0 joins them into strings.
        - steps (functions): preprocessing functions applied in order; each
          takes and returns a pandas Series.
    Output:
        - pd.Series of token lists (tokenized_YN == 1) or of joined strings.
    """
    # Work on a copy so the source column is left untouched.
    processed = df["Lyrics Raw"].copy()
    for step in steps:
        processed = step(processed)
    if tokenized_YN == 1:
        return processed
    return processed.apply(" ".join)
df_lyrics = df_lyrics_with_genre.copy()
df_lyrics.head()
# Rename Lyrics column to Lyrics Raw
df_lyrics.rename(columns={"Lyrics":"Lyrics Raw"},inplace=True)
# Remove all content inside square brackets and \n from Lyrics
df_lyrics["Lyrics"] = verse_info_removal(df_lyrics["Lyrics Raw"]).apply(lambda x: re.sub(r"\n", ". ", str(x)))
# Strip all leading and trailing whitespace
df_lyrics["Lyrics"] = df_lyrics['Lyrics'].str.strip()
# Remove all empty srings
df_lyrics = df_lyrics[df_lyrics['Lyrics'].map(len) > 0]
# Detect language of all song lyrics (function takes quite long to execute)
df_lyrics["Language"] = detect_language(df_lyrics["Lyrics"])
# Print number of occurences of each genre
df_lyrics["Language"].value_counts()
# Only keep song lyrics in english (which also removes not classified songs)
df_lyrics = df_lyrics[df_lyrics["Language"] == "en"].reset_index(drop=True)
# Remove language column
df_lyrics = df_lyrics.drop("Language",axis=1)
# Create column with preprocessed and tokenized lyrics
df_lyrics["Lyrics Clean Tok"] = preprocessing(df_lyrics,
1,
verse_info_removal,
tokenization,
not_appender,
only_alphabetic,
lowercase,
stopword_removal,
lemmatization)
# Create column with preprocessed and un-tokenized lyrics
df_lyrics["Lyrics Clean No Tok"] = preprocessing(df_lyrics,
0,
verse_info_removal,
tokenization,
not_appender,
only_alphabetic,
lowercase,
stopword_removal,
lemmatization)
df_lyrics.describe()
# Drop empty rows
df_lyrics = df_lyrics.dropna(axis=0).reset_index(drop=True)
# Create column with token count of songs
df_lyrics["Word Count"] = df_lyrics["Lyrics"].str.split().map(lambda x: len(x))
# Drop rows with less than 20 tokens
df_lyrics = df_lyrics[df_lyrics["Word Count"] >= 20].reset_index(drop=True)
df_lyrics["Genre"].value_counts()
German hip hop is hardly represented as a genre in the lyrics dataset, which makes sense given that all non-English tracks are supposed to be filtered out. Taking a closer look at the lyrics from this genre, it is apparent that those songs have probably been falsely classified as English (probably due to the mix of English and German words); they are therefore dropped.
for lyric in df_lyrics[df_lyrics["Genre"] == "german hip hop"]["Lyrics"]:
print(lyric)
# Drop tracks that belong to German hip hop
df_lyrics = df_lyrics[df_lyrics["Genre"] != "german hip hop"].reset_index(drop=True)
df_lyrics.head()
df_lyrics.shape
df_lyrics.to_csv(PATH_LYRICS_CLEAN)
# Reload the cleaned tracks dataset (list columns parsed back from strings)
df_tracks = pd.read_csv(PATH_TRACKS_CLEAN, index_col=0, converters = {"Artist Names": ast.literal_eval})
df_tracks.head()
# Encode the Explicit flag as 0/1
df_tracks['Explicit'] = df_tracks["Explicit"].astype(int)
# Extract the release year from the full release date
df_tracks['Year'] = pd.DatetimeIndex(df_tracks['Release Date']).year
The selected audio features are numeric values provided by the Spotify API
# Numeric per-track features used throughout the popularity analysis
audio_features = ['Popularity', 'Acousticness', 'Danceability', 'Energy', 'Instrumentalness', 'Liveness', 'Loudness',
                  'Speechiness', 'Valence', 'Tempo', 'Key', 'Mode', 'Year', 'Explicit']
df_tracks_af = df_tracks[audio_features]
# Drop tracks for which audio features are not available
df_tracks_af = df_tracks_af.dropna(axis=0)
# Correlation heatmap over all audio features
plt.figure(figsize=(20, 10))
sns.heatmap(df_tracks_af.corr(),annot = True, fmt='.1g',square=True)
We can use a correlation plot to check how the audio feature are related with popularity
corr = np.abs(df_tracks_af.corr())
series = np.abs(corr['Popularity']).sort_values(ascending=False)
print('The most linear correlated features to POPULARITY are:')
for i, row in enumerate(series):
if 0.2 <= row < 1:
print(f'{series.index[i]:17} --> {row: .2f} (abs)')
Also, possible outliers are detected and, if present, dropped
plt.figure(figsize=(16, 10))
for i in range(len(df_tracks_af.columns)):
plt.subplot(3, 5, i + 1)
sns.boxplot(df_tracks_af[df_tracks_af.columns[i]])
plt.show()
def outliers_count(df, threshold):
    """
    Count, per column, how many values are outliers under a z-score test.

    Parameters:
        df (pd.DataFrame): numeric dataframe to scan.
        threshold (float): quantile (e.g. 0.99999) converted into a z-score
            cutoff via the normal inverse CDF.

    Returns:
        pd.Series: number of outliers found in each column.
    """
    # Bug fix: the function previously ignored its `df` argument and always
    # copied the global `df_tracks_af` instead.
    df = df.copy()
    # Get the z-score cutoff for the specified quantile threshold
    threshold_z_score = stats.norm.ppf(threshold)
    # Absolute z-score of every value in df
    z_score_df = pd.DataFrame(np.abs(stats.zscore(df)), columns=df.columns)
    # Compare against the cutoff and count outliers column by column
    return (z_score_df > threshold_z_score).sum(axis=0)
outliers_count(df_tracks_af, 0.99999)
def outliers_cleaner(df, threshold):
    """
    Return a copy of the dataframe with outlier rows removed; a row counts as
    an outlier if any of its values exceeds the z-score cutoff derived from
    the given quantile threshold.
    """
    cleaned = df.copy()
    # Z-score cutoff for the requested quantile.
    cutoff = stats.norm.ppf(threshold)
    # Absolute z-scores for every cell.
    z_scores = pd.DataFrame(np.abs(stats.zscore(cleaned)), columns=cleaned.columns)
    # A row is an outlier as soon as one of its columns crosses the cutoff.
    is_outlier = (z_scores > cutoff).sum(axis=1) > 0
    # Drop those rows and renumber the index.
    return cleaned.drop(cleaned.index[is_outlier], axis=0).reset_index(drop=True)
df_tracks_af_clean = outliers_cleaner(df_tracks_af,0.99999)
In the following section some of the most relevant music features are analyzed to better understand how they affect popularity
fig, ax = plt.subplots(figsize=(16, 4))
sns.distplot(df_tracks['Acousticness'], kde=False, bins=30)
plt.show()
fig, ax = plt.subplots(figsize=(15, 6))
ax1_data = df_tracks.groupby('Acousticness')['Popularity'].mean().to_frame().reset_index()
ax = sns.scatterplot(x = ax1_data['Acousticness'], y = ax1_data['Popularity'], color='blue', ax=ax)
ax.set_title('Acousticness vs. Mean Popularity')
ax.set_ylabel('Mean Popularity', fontsize=12)
plt.tight_layout()
plt.show()
It seems that the less acoustic a song is, the more popular it is on average
fig, ax = plt.subplots(figsize=(16, 4))
sns.distplot(df_tracks['Loudness'], kde=False, bins=30)
plt.show()
fig, ax = plt.subplots(figsize=(15, 6))
ax1_data = df_tracks.groupby('Loudness')['Popularity'].mean().to_frame().reset_index()
ax = sns.scatterplot(x = ax1_data['Loudness'], y = ax1_data['Popularity'], color='blue', ax=ax)
ax.set_title('Loudness vs. Mean Popularity')
ax.set_ylabel('Mean Popularity', fontsize=12)
plt.tight_layout()
plt.show()
Values of Loudness between 0 and -5 seem to be the most popular ones. Also, it is not uncommon for songs to be in that range of Loudness
fig, ax = plt.subplots(figsize=(16, 4))
sns.distplot(df_tracks['Instrumentalness'], kde=False, bins=30)
plt.show()
fig, ax = plt.subplots(figsize=(15, 6))
ax1_data = df_tracks.groupby('Instrumentalness')['Popularity'].mean().to_frame().reset_index()
ax = sns.scatterplot(x = ax1_data['Instrumentalness'], y = ax1_data['Popularity'], color='blue', ax=ax)
ax.set_title('Instrumentalness vs. Mean Popularity')
ax.set_ylabel('Mean Popularity', fontsize=12)
plt.tight_layout()
plt.show()
The vast majority of the tracks seem to have an Instrumentalness value close to 0
fig, ax = plt.subplots(figsize=(16, 4))
sns.distplot(df_tracks['Energy'], kde=False, bins=30)
plt.show()
fig, ax = plt.subplots(figsize=(15, 6))
ax1_data = df_tracks.groupby('Energy')['Popularity'].mean().to_frame().reset_index()
ax = sns.scatterplot(x = ax1_data['Energy'], y = ax1_data['Popularity'], color='blue', ax=ax)
ax.set_title('Energy vs. Mean Popularity')
ax.set_ylabel('Mean Popularity', fontsize=12)
plt.tight_layout()
plt.show()
Energy values are more equally distributed. Between 0.2 and 0.8 there is a linear relationship between Popularity and Danceability
fig, ax = plt.subplots(figsize=(16, 4))
sns.distplot(df_tracks['Danceability'], kde=False, bins=30)
plt.show()
fig, ax = plt.subplots(figsize=(15, 6))
ax1_data = df_tracks.groupby('Danceability')['Popularity'].mean().to_frame().reset_index()
ax = sns.scatterplot(x = ax1_data['Danceability'], y = ax1_data['Popularity'], color='blue', ax=ax)
ax.set_title('Danceability vs. Mean Popularity')
ax.set_ylabel('Mean Popularity', fontsize=12)
plt.tight_layout()
plt.show()
Despite the normal distribution, like energy, danceability seems to be linearly related with popularity between 0.2 and 0.8
df_tracks['Popularity'].describe()
fig, ax = plt.subplots(figsize=(16, 4))
sns.distplot(df_tracks['Popularity'], kde=False, bins=30)
plt.show()
fig, ax = plt.subplots(figsize=(20, 4))
ax = df_tracks.groupby('Year')['Track Name'].count().plot()
ax.set_title('Number of tracks over the years', weight='bold')
ax.set_ylabel('Number of Tracks', weight='bold')
ax.set_xlabel('Year', weight='bold')
ax.set_xticks(range(1920, 2021, 5))
plt.show()
The vast majority of the songs are recent. Also, very few song have high level of popularity
df_tracks.groupby('Year')['Track Name'].count()
frequency_year = df_tracks.groupby('Year')['Track Name'].count().reset_index()
frequency_year.columns = ['Year','Number of tracks']
frequency_year['%'] = (frequency_year['Number of tracks']/(frequency_year['Number of tracks'].sum()))*100
frequency_year
fig, ax = plt.subplots(figsize=(20, 4))
ax = df_tracks.groupby('Year')['Popularity'].max().plot()
ax.set_title('Max Popularity over the years', c='r', weight='bold')
ax.set_ylabel('Max Popularity', weight='bold')
ax.set_xlabel('Year', weight='bold')
ax.set_xticks(range(1920, 2021, 5))
plt.show()
fig, ax = plt.subplots(figsize = (12, 10))
top_songs = df_tracks.groupby('Track Name')['Popularity'].max().sort_values(ascending=False).head(25)
ax = sns.barplot(x=top_songs.values, y=top_songs.index, orient="h", edgecolor='black', ax=ax)
ax.set_xlabel('Popularity', fontsize=12)
ax.set_ylabel('Track', fontsize=12)
ax.set_title('Most Popular Tracks', fontsize=14, weight = 'bold')
plt.show()
Setting the values for split test base on the findings of EDA and correlation plot
y = df_tracks_af_clean['Popularity']
X_all = df_tracks_af_clean.drop('Popularity', axis=1) #including all the numeric features
X = df_tracks_af_clean.drop(['Popularity','Key','Tempo','Mode',
'Liveness','Speechiness', 'Valence','Year'],
axis=1) #including only features selected in EDA
Defining the X variables for Statsmodel, which requires a constant added to the data frame
X_stats = sm.add_constant(X)
X_all_stats = sm.add_constant(X_all)
The first attempt in popularity prediction aims to predict popularity over all the years present in the data
The model is firstly fitted with all the music features
X_train, X_test, y_train, y_test = train_test_split(X_all_stats, y, test_size=0.2, random_state=RANDOM)
X_train.describe()
model = sm.OLS(y_train, X_train).fit()
model_summary=model.summary()
print(model_summary)
The model can explain less than 1/4 of the variance of the dependent variable with all the music features. Key seems to not be statistically relevant.
predictions = model.predict(X_test)
mse = sm.tools.eval_measures.mse(y_test, predictions)
rmse = np.sqrt(mse)
print('Mean squared error: %.2f'
% mse)
print ('Root mean squared error: %.2f'
% rmse)
The general model can predict popularity with a RMSE of 14.91, not too accurate on a scale from 0 to 100
# Plot outputs
fig, ax = plt.subplots(figsize=(6, 6))
ax = sns.scatterplot(x=y_test, y=predictions)
sns.lineplot(x=y_train, y=y_train, color='black', ax=ax)
ax.set_xlabel('Y_test')
ax.set_ylabel('Y_pred')
ax.set_title('y_test vs. y_pred', fontsize=14, color='black')
plt.show()
The second model is fitted with only the music features recognized relevant in the EDA section
X_train, X_test, y_train, y_test = train_test_split(X_stats, y, test_size=0.2, random_state=RANDOM)
model = sm.OLS(y_train, X_train).fit()
model_summary=model.summary()
print(model_summary)
predictions = model.predict(X_test)
resids = model.resid
mse = sm.tools.eval_measures.mse(y_test, predictions)
rmse = np.sqrt(mse)
print('Mean squared error: %.2f'
% mse)
print ('Root mean squared error: %.2f'
% rmse)
# Plot outputs
fig, ax = plt.subplots(figsize=(6, 6))
ax = sns.scatterplot(x=y_test, y=predictions)
sns.lineplot(x=y_train, y=y_train, color='black', ax=ax)
ax.set_xlabel('Y_test')
ax.set_ylabel('Y_pred')
ax.set_title('y_test vs. y_pred', fontsize=14, color='black')
plt.show()
As analyzed in the EDA for popularity prediction, the release year seems to be a very influential variable. Popularity is calculated starting from the number of 'plays' a song got. Therefore, most recent songs are more likely to be popular as they are likely to be played by a higher number of listeners. Also, the vast majority of songs are from 2020. Therefore, popularity will be predicted considering only songs from 2020. The reasons for this choice are two folds. On the one hand, considering all the years does not seem to provide quality results. On the other hand, as popularity seems highly related to how recent a song is, predicting popularity starting from recent songs, seems reasonable.
df_tracks_af_2020 = df_tracks_af_clean[df_tracks_af_clean['Year'] == 2020]
df_tracks_af_2020.describe()
df_tracks_af_clean.describe()
y = df_tracks_af_2020['Popularity']
X_all = df_tracks_af_2020.drop('Popularity', axis=1) #including all the numeric features
X = df_tracks_af_2020.drop(['Popularity','Key','Tempo','Mode',
'Liveness','Speechiness', 'Valence', 'Year'],
axis=1) #including only features selected in EDA
Defining the X variables for Statsmodel, which requires a constant added to the data frame
X_stats = sm.add_constant(X)
X_all_stats = sm.add_constant(X_all)
X_train, X_test, y_train, y_test = train_test_split(X_all_stats, y, test_size=0.2, random_state=RANDOM)
model = sm.OLS(y_train, X_train).fit()
model_summary=model.summary()
print(model_summary)
The model can now explain 55.2% of the variance of the dependent variable with all the music features. A significant improvement from the model considering all the years. Also, in this new model it's possible to see how Key, Tempo and Mode don't seem to be statistically relevant
predictions = model.predict(X_test)
mse = sm.tools.eval_measures.mse(y_test, predictions)
rmse = np.sqrt(mse)
print('Mean squared error: %.2f'
% mse)
print ('Root mean squared error: %.2f'
% rmse)
The general model can predict popularity with a RMSE of 14.91, not too accuarate in a scale from 0 to 100
# Plot outputs
fig, ax = plt.subplots(figsize=(6, 6))
ax = sns.scatterplot(x=y_test, y=predictions)
sns.lineplot(x=y_train, y=y_train, color='black', ax=ax)
ax.set_xlabel('Y_test')
ax.set_ylabel('Y_pred')
ax.set_title('y_test vs. y_pred', fontsize=14, color='black')
plt.show()
residuals = y_test - predictions
sns.mpl.rcParams['figure.figsize'] = (15.0, 7.0)
fig, ax = plt.subplots(1,2 )
sns.regplot(x=predictions, y=y_test, lowess=True, ax=ax[0], line_kws={'color': 'red'})
ax[0].set_title('Observed vs. Predicted Values', fontsize=16)
ax[0].set(xlabel='Predicted', ylabel='Observed')
sns.regplot(x=predictions, y=residuals, lowess=True, ax=ax[1], line_kws={'color': 'red'})
ax[1].set_title('Residuals vs. Predicted Values', fontsize=16)
ax[1].set(xlabel='Predicted', ylabel='Residuals')
X_train, X_test, y_train, y_test = train_test_split(X_stats, y, test_size=0.2, random_state=RANDOM)
model = sm.OLS(y_train, X_train).fit()
model_summary=model.summary()
print(model_summary)
predictions = model.predict(X_test)
residuals = y_test - predictions
mse = sm.tools.eval_measures.mse(y_test, predictions)
rmse = np.sqrt(mse)
print('Mean squared error: %.2f'
% mse)
print ('Root mean squared error: %.2f'
% rmse)
# Plot outputs
fig, ax = plt.subplots(figsize=(6, 6))
ax = sns.scatterplot(x=y_test, y=predictions)
sns.lineplot(x=y_train, y=y_train, color='black', ax=ax)
ax.set_xlabel('Y_test')
ax.set_ylabel('Y_pred')
ax.set_title('y_test vs. y_pred', fontsize=14, color='black')
plt.show()
sns.mpl.rcParams['figure.figsize'] = (15, 7)
fig, ax = plt.subplots(1,2 )
sns.regplot(x=predictions, y=y_test, lowess=True, ax=ax[0], line_kws={'color': 'red'})
ax[0].set_title('Observed vs. Predicted Values', fontsize=16)
ax[0].set(xlabel='Predicted', ylabel='Observed')
sns.regplot(x=predictions, y=residuals, lowess=True, ax=ax[1], line_kws={'color': 'red'})
ax[1].set_title('Residuals vs. Predicted Values', fontsize=16)
ax[1].set(xlabel='Predicted', ylabel='Residuals')
It seems that none of the models follows a linear behaviour in predicting popularity between 0 and 30. Also, despite the improvements, accurate prediction is not possible yet. Therefore some more models will be considered. Lasso and Ridge regression could reduce errors, so they will be tested next
X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size = 0.3, random_state=RANDOM)
#grid search for hypertuning parameters
param_grid = {'alpha': np.arange(0.1, 50)}
lasso = Lasso()
lasso_cv = GridSearchCV(lasso, param_grid, cv=5)
lasso_cv.fit(X_train, y_train)
lasso_cv.best_params_
#lasso
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)
lasso.score(X_test, y_test)
print('Mean squared error: %.2f'
% mean_squared_error(y_test, lasso_pred))
print('Mean RMSE: %.2f'
% np.sqrt(metrics.mean_squared_error(y_test,lasso_pred)))
print('Coefficient of determination (1 is perfect prediction): %.4f'
% r2_score(y_test,lasso_pred))
# Lasso regression on the EDA-selected feature set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=RANDOM)
# Grid search for the regularization strength.
# NOTE: the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2; `False` was the default, so dropping it is behavior-neutral.
param_grid = {'alpha': np.arange(0.1, 50)}
lasso = Lasso()
lasso_cv = GridSearchCV(lasso, param_grid, cv=5)
lasso_cv.fit(X_train, y_train)
lasso_cv.best_params_
# Refit with the chosen alpha and evaluate on the held-out split
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)
lasso.score(X_test, y_test)
print('Mean squared error: %.2f'
% mean_squared_error(y_test, lasso_pred))
print('Mean RMSE: %.2f'
% np.sqrt(metrics.mean_squared_error(y_test,lasso_pred)))
print('Coefficient of determination (1 is perfect prediction): %.4f'
% r2_score(y_test,lasso_pred))
# Ridge regression on the EDA-selected feature set.
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.3, random_state=RANDOM)
# Grid search for alpha. `normalize` was removed in scikit-learn 1.2
# (False was the default, so behavior is unchanged).
param_grid = {'alpha': np.arange(0.1, 40)}
ridge = Ridge()
ridge_cv = GridSearchCV(ridge, param_grid, cv=5)
# Bug fix: the search was fitted on the full X/y, leaking test data into
# hyper-parameter selection; fit it on the training split only.
ridge_cv.fit(X_train, y_train)
ridge_cv.best_params_
# Refit with the chosen alpha and evaluate on the held-out split
ridge = Ridge(alpha=39.1)
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)
ridge.score(X_test, y_test)
print('Mean squared error: %.2f'
% mean_squared_error(y_test, ridge_pred))
print('Mean RMSE: %.2f'
% np.sqrt(metrics.mean_squared_error(y_test,ridge_pred)))
print('Coefficient of determination (1 is perfect prediction): %.4f'
% r2_score(y_test,ridge_pred))
# Ridge regression on the full feature set.
X_train, X_test, y_train, y_test = train_test_split(X_all, y,test_size = 0.3, random_state=RANDOM)
# Grid search for alpha. `normalize` was removed in scikit-learn 1.2
# (False was the default, so behavior is unchanged).
param_grid = {'alpha': np.arange(0.1, 40)}
ridge = Ridge()
ridge_cv = GridSearchCV(ridge, param_grid, cv=5)
# Bug fix: fit the search on the training split only — fitting on the full
# X_all/y leaked test data into hyper-parameter selection.
ridge_cv.fit(X_train, y_train)
ridge_cv.best_params_
# Refit with the chosen alpha and evaluate on the held-out split
ridge = Ridge(alpha=39.1)
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)
ridge.score(X_test, y_test)
print('Mean squared error: %.2f'
% mean_squared_error(y_test, ridge_pred))
print('Mean RMSE: %.2f'
% np.sqrt(metrics.mean_squared_error(y_test,ridge_pred)))
print('Coefficient of determination (1 is perfect prediction): %.4f'
% r2_score(y_test,ridge_pred))
Similarly to what has been done for the statsmodels' linear regression results, yellowbrick is now used to analyze the residual plots of both Lasso and Ridge regression, to understand if regularized linear regressors reduced non-linearity (more about the package function here https://www.scikit-yb.org/en/latest/api/regressor/residuals.html)
X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.2, random_state=RANDOM)
model = Lasso(alpha=0.1)
visualizer = ResidualsPlot(model, hist=True)
visualizer.fit(X_train, y_train) # Fit the training data to the visualizer
visualizer.score(X_test, y_test) # Evaluate the model on the test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM)
model = Lasso(alpha=0.1)
visualizer = ResidualsPlot(model, hist=True)
visualizer.fit(X_train, y_train) # Fit the training data to the visualizer
visualizer.score(X_test, y_test) # Evaluate the model on the test data
X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.2, random_state=RANDOM)
model = Ridge(39.1)
visualizer = ResidualsPlot(model, hist=True)
visualizer.fit(X_train, y_train) # Fit the training data to the visualizer
visualizer.score(X_test, y_test) # Evaluate the model on the test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM)
model = Ridge(39.1)
visualizer = ResidualsPlot(model, hist=True)
visualizer.fit(X_train, y_train) # Fit the training data to the visualizer
visualizer.score(X_test, y_test) # Evaluate the model on the test data
Similarly to what was observed in the Linear Regression estimators, also Lasso and Ridge do not follow a linear relationship for the lower values of popularity. Therefore, a not-linear regressor is used.
# Random forest on the full feature set.
X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.2, random_state=RANDOM)
# Grid search over forest size. Bug fix: the search was fitted on the full
# X_all/y, leaking test data into model selection; use the training split.
param_grid = {'n_estimators': np.arange(25, 201, 25)}
rf = RandomForestRegressor()
rf = GridSearchCV(rf, param_grid, cv=3, verbose=10)
rf.fit(X_train, y_train)
rf.best_params_
# Refit the chosen configuration and evaluate on the held-out split
RF = RandomForestRegressor(n_estimators=150)
RF.fit(X_train,y_train)
pred = RF.predict(X_test)
mse=mean_squared_error(y_test,pred )
rmse = np.sqrt(mse)
print(mse)
print(rmse)
# Impurity-based feature importances, most important first
pd.DataFrame({'Variable':X_train.columns,
'Importance':RF.feature_importances_}).sort_values('Importance', ascending=False)
The random forest regressor gives the lowest error. In order to obtain more information about how it reached its output. Firstly we use both the classifier-owned ‘feature importance’ method and eli5 to understand which features which are the features considered the most relevant by the model. Secondly, we calculate features’ contribution in affecting the weight result (https://towardsdatascience.com/machine-learning-explainability-introduction-via-eli5-99c767f017e2 )
#Permutation Importance
perm = PermutationImportance(RF ,random_state=RANDOM).fit(X_test, y_test)
show_weights(perm, feature_names = list(X_test.columns))
show_weights(RF, feature_names = list(X_test.columns))
show_prediction(RF, X_test.iloc[1],show_feature_values=True)
# Plot outputs
fig, ax = plt.subplots(figsize=(6, 6))
ax = sns.scatterplot(x=y_test, y=pred)
sns.lineplot(x=y_train, y=y_train, color='black', ax=ax)
ax.set_xlabel('Y_test')
ax.set_ylabel('Y_pred')
ax.set_title('y_test vs. y_pred', fontsize=14, color='black')
plt.show()
Despite all the efforts in improving the models and the good improvements obtained so far it seems like it's still not possible for any model to predict popularity above 65. Nonetheless it was possible to identify which elements affect popularity the most thanks to eli5.
# Random forest on the EDA-selected feature set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM)
# Grid search over forest size. Two bug fixes: the original fitted the
# search on X_all (the wrong design matrix for this model) and on the full
# data (test-data leakage); fit on the training split of X instead.
param_grid = {'n_estimators': np.arange(25, 201, 25)}
rf = RandomForestRegressor()
rf = GridSearchCV(rf, param_grid, cv=3, verbose = 10)
rf.fit(X_train, y_train)
rf.best_params_
RF = RandomForestRegressor(n_estimators=125)
RF.fit(X_train,y_train)
# Bug fix: predict the held-out split. The original computed
# `RF.predict(X_train).clip(0, 1)` (popularity is on a 0-100 scale, so
# clipping to [0, 1] is meaningless) and then scored a stale `pred`
# left over from the previous model.
pred = RF.predict(X_test)
RF.score(X_test,y_test)
mse=mean_squared_error(y_test,pred )
rmse = np.sqrt(mse)
print(mse)
print(rmse)
# Impurity-based feature importances, most important first
pd.DataFrame({'Variable':X_train.columns,
'Importance':RF.feature_importances_}).sort_values('Importance', ascending=False)
#Permutation Importance
perm = PermutationImportance(RF ,random_state=RANDOM).fit(X_test, y_test)
show_weights(perm, feature_names = list(X_test.columns))
show_weights(RF, feature_names = list(X_test.columns))
show_prediction(RF, X_test.iloc[1],show_feature_values=True)
# Plot outputs: predictions vs. ground truth with the identity line
fig, ax = plt.subplots(figsize=(6, 6))
ax = sns.scatterplot(x=y_test, y=pred)
sns.lineplot(x=y_train, y=y_train, color='black', ax=ax)
ax.set_xlabel('Y_test')
ax.set_ylabel('Y_pred')
ax.set_title('y_test vs. y_pred', fontsize=14, color='black')
plt.show()
Despite all the efforts in improving the models and the good improvements obtained so far it seems like it's still not possible for any model to predict popularity above 65. As previosuly said, artist popularity may affect popularity. Let's test this hypothesis
df_nodes = pd.read_csv(PATH_NODES, index_col=0)
df_edgelist = pd.read_csv(PATH_EDGES, index_col=0)
Create list with variables for the network to hold as node data
node_att_names = ["Rank", "Tracks Bins", "Popularity", "Popularity Bins", "Genre", "Genre Color"]
def generate_networkx(df_artists, df_edgelist, node_att_names):
    """Build an undirected collaboration graph from artists and edges.

    Parameters
    ----------
    df_artists : pd.DataFrame
        One row per artist; must contain "Artist Name" plus *node_att_names*.
    df_edgelist : pd.DataFrame
        Edges with columns "Artist_1", "Artist_2" and "Weight for Viz".
    node_att_names : list of str
        Column names copied onto each node as attributes.

    Returns
    -------
    nx.Graph
        Graph containing only artists that appear in at least one edge.
    """
    # Re-index without mutating the caller's frame (the original used
    # inplace=True, silently altering the argument), so index-based access
    # below has no gaps.
    df_artists = df_artists.reset_index(drop=True)
    # Hoisted out of the loop: the original rebuilt this set once per artist,
    # making node creation O(n * m).
    connected_artists = set(pd.concat([df_edgelist["Artist_1"],
                                       df_edgelist["Artist_2"]]))
    G = nx.Graph()
    for counter in range(len(df_artists)):
        name = df_artists.loc[counter, "Artist Name"]
        # Only keep artists that actually take part in a collaboration
        if name in connected_artists:
            G.add_node(name)
            for att_name in node_att_names:
                G.nodes[name][att_name] = df_artists.loc[counter, att_name]
    # Add an edge only when both endpoints survived the filter above
    for artist_1, artist_2, weight in zip(df_edgelist["Artist_1"],
                                          df_edgelist["Artist_2"],
                                          df_edgelist["Weight for Viz"]):
        if artist_1 in G.nodes() and artist_2 in G.nodes():
            G.add_edge(artist_1, artist_2, weight = weight)
    return G
The overall network is generated.
G = generate_networkx(df_nodes, df_edgelist, node_att_names)
len(df_nodes) - len(G.nodes)
From the nodes dataframe given in, 174 are not connected in the network. They are taken out to avoid noise and to put the focus in the analysis of artists and their behaviour that show connections with other artists.
Narrowing down based on different weights for further analysis later on.
Create network with only edges with a weight more than one
G_two_plus = generate_networkx(df_nodes, df_edgelist[df_edgelist["Weight"]>1], node_att_names)
Create network with only edges with a weight more than five
G_five_plus = generate_networkx(df_nodes, df_edgelist[df_edgelist["Weight"]>5], node_att_names)
Create network with only edges with a weight more than ten
G_ten_plus = generate_networkx(df_nodes, df_edgelist[df_edgelist["Weight"]>9], node_att_names)
Assign with the louvain algorithm nodes a group, based on the network shape.
partition = community_louvain.best_partition(G, random_state=RANDOM)
nx.set_node_attributes(G, partition, "Partition")
Enrich data with centrality measures. Betweenness centrality is left out since it does not really add any value for the case at hand.
degree_centrality = nx.degree_centrality(G)
eigenvector_centrality = nx.eigenvector_centrality_numpy(G, weight="weight")
#betweenness_centrality = nx.betweenness_centrality(G, weight="weight")
nx.set_node_attributes(G, degree_centrality, "Degree Centrality")
nx.set_node_attributes(G, eigenvector_centrality, "Eigenvector Centrality")
#nx.set_node_attributes(G, betweenness_centrality, "Betweenness Centrality")
The data for the network's node are taken and placed in a dataframe.
df_nodes_attributes = pd.DataFrame.from_dict(dict(G.nodes(data=True)),
orient="index").reset_index().rename({"index": "Artist Name"}, axis=1)
Assigning the partition colors as before done for the genres.
# Build a color lookup table: one display color per louvain community.
partition_colors = pd.DataFrame(df_nodes_attributes["Partition"].value_counts())
# replace the counting with index, since not needed anymore
partition_colors["Partition"] = partition_colors.index
len(partition_colors)
# The palette is cycled to the number of detected communities, so the
# assignment no longer raises when louvain finds more or fewer than the
# 13 communities this list was hard-coded for (identical colors when 13).
_PARTITION_PALETTE = ["#cd6155",
                      "#566573",
                      "#99a3a4",
                      "#3bc14a",
                      "#5499c7",
                      "#057476",
                      "#2980b9",
                      "#bb8fce",
                      "#dc7633",
                      "#ffe900",
                      "#eb984e",
                      "#4290f5",
                      "#fcba03"]
partition_colors["Partition Color"] = list(itertools.islice(itertools.cycle(_PARTITION_PALETTE),
                                                            len(partition_colors)))
# Attach the colors to the node attributes and keep only partition columns
partition_list = pd.merge(df_nodes_attributes, partition_colors, on=["Partition"], how="left")
partition_list.drop(columns=["Rank",
                             "Tracks Bins",
                             "Popularity",
                             "Popularity Bins",
                             "Genre",
                             "Genre Color"], inplace=True)
Merge Partition df to df_artists
df_nodes = pd.merge(df_nodes, partition_list, on="Artist Name", how="left")
nx.density(G)
With a share of 2.2% of all possible connections in the network, the overall network between the artists worldwide is quite loose. This is logical since there are many different genres out there and not everyone can collaborate.
nx.transitivity(G)
The transitivity, also called clustering coefficient, indicates how locally clustered a network is. It gives out the probability of closed triplets. With around 29.0% this is quite high compared to the overall connectedness and considering the size of the network.
# nx.diameter(G)
The diameter calculates the longest of the shortest paths between two nodes in the network and throws an error since the generated network consists of multiple unconnected components. The biggest component could be selected to get a value, but since the measure does not work very well in regards to interpretability for the case at hand anyhow, this is not done.
# nx.average_shortest_path_length(G)
Also the average shortes path length throws and error for the same reasons as the diameter measure. It also is not useful in regards to interpretability and therefore not adjusted and calculated.
Most collaborative Artists in the overall network by weight
df_edgelist = df_edgelist.reset_index()
edges_count = pd.melt(df_edgelist, id_vars=['index','Weight'], value_vars=['Artist_1', 'Artist_2'],
var_name='Listing', value_name='Artists')
Add weight as a measure to the dataframe
df_nodes = pd.merge(df_nodes, edges_count.groupby('Artists')["Weight"].sum(),
left_on="Artist Name", right_index=True, how="left")
pd.DataFrame(df_nodes.groupby("Artist Name")["Weight"].sum().sort_values(ascending=False)).head()
for artist in ["Gucci Mane", "Rick Ross", "Lil Wayne", "Future", "Young Thug"]:
print(df_nodes.loc[df_nodes["Artist Name"] == artist,
["Tracks Collabo Share", "Artist Name"]])
It can directly be noticed that especially Artists from the pop rap/rap genre tend to collaborate heavily for their songs. This is also the case in the overall output if their songs. The range of the top five artist is between around 55% up to 83%.
Most important artists by their centrality measures.
Degree Centrality:
df_nodes.loc[:, ["Artist Name", "Degree Centrality"]].sort_values(by="Degree Centrality",
ascending=False).head(5)
Eigenvector Centrality:
df_nodes.loc[:, ["Artist Name", "Eigenvector Centrality"]].sort_values(by="Eigenvector Centrality",
ascending=False).head(5)
Also the two measures degree centrality and eigenvector centrality support the finding of the weights that the network is highly characterized by the most famous pop rappers and rappers.
Now the network is looked at from the perspective of the partition, which is the result from the louvain algorithm. First the network is visualized with focus of the partition feature.
nx.draw_kamada_kawai(G,
node_color=list(partition.values()))
Since the visualization is static and too packed, its not optimal for gaining insights. In the following, more advanced visualization techniques are used.
# Setting the default figure size a bit larger
defaults = dict(width=750, height=750, padding=0.1,
xaxis=None, yaxis=None)
hv.opts.defaults(
opts.EdgePaths(**defaults), opts.Graph(**defaults), opts.Nodes(**defaults))
graph = hv.Graph.from_networkx(G, nx.layout.fruchterman_reingold_layout).opts(
tools=["hover"],
edge_alpha=0.2,
node_color="Partition",
#node_size="Centrality Degree",
cmap="Set1",
legend_position="right"
)
show(hv.render((graph)))
Holoviews offers better solution since it is dynamic and allows to zoom in. Another option which also offers non-static visualizations is pyvis.
def create_pyvis_graph_partition(df_artists, df_edgelist, height="1500px", width="100%", bgcolor="#222222",
                                 font_color="#1E1C1E", algo="barnes", notebook=True,
                                 buttons_YN=True, edge_width_YN=True, heading="", file_name="pyvis_graph.html"):
    """
    Creates a pyvis visualization of a network colored by partition, edge width by weight and node size by
    degree centrality.
    Input:
    - df_artists: dataframe holding the relevant artist data
    - df_edgelist: dataframe holding the relevant edges
    - height (str): defines height of the graph
    - width (str): defines width of the graph
    - bgcolor (str): defines color of the background (HTML color code can be used or simple color naming)
    - font_color (str): defines the font color (HTML color code can be used or simple color naming)
    - algo (str): the options "barnes", "forced" and "hr" are possible options for the algorithm to set up graph
    - notebook (boolean): determines if graph is saved as seperate HTML file or shown within the notebook
    - buttons_YN (boolean): adds a bar with design setting options on the side
    - edge_width_YN (boolean): adjusts the width of the edges based on the weight if turned on
    - heading (str): adds a header to the graph
    - file_name (str): name of the HTML file the rendered graph is written to
    Output:
    pyvis graph
    """
    # Set up network
    g = Network(height=height, width=width,
                bgcolor=bgcolor, font_color=font_color,
                notebook=notebook, heading=heading)
    # Set width of graph depending if setting should be shown on the side, only if not displayed in notebook
    # since there results in issues
    if notebook == True:
        if buttons_YN == True:
            g.show_buttons()
        else:
            g.width = "100%"
    else:
        if buttons_YN == True:
            g.width = "70%"
            g.show_buttons(["physics"])
        else:
            g.width = "100%"
    # Hoisted out of the loop: artists that actually appear in an edge
    connected_artists = set(pd.concat([df_edgelist["Artist_1"], df_edgelist["Artist_2"]]))
    for artist, value, part, color in zip(df_artists["Artist Name"],
                                          df_artists["Degree Centrality"],
                                          df_artists["Partition"],
                                          df_artists["Partition Color"]):
        if artist in connected_artists:
            g.add_node(artist,
                       value=value,
                       title= f"Artist: {artist} | Partition: {part}",
                       color=color,
                       # Bug fix: the original stored the module-level
                       # `partition` dict on every node; store this artist's
                       # own community id instead.
                       partition=part)
    # Create edges with all artist pairs; the two original loops differed
    # only in the `width` keyword, so they are merged.
    for artist_1, artist_2, weight in zip(df_edgelist["Artist_1"],
                                          df_edgelist["Artist_2"],
                                          df_edgelist["Weight"]):
        if edge_width_YN == True:
            g.add_edge(artist_1, artist_2, weight=weight, width=weight, title=f"Weight: {weight}")
        else:
            g.add_edge(artist_1, artist_2, weight=weight, title=f"Weight: {weight}")
    # Set algorithm
    if algo == "barnes":
        g.barnes_hut()
    elif algo == "forced":
        g.force_atlas_2based()
    elif algo == "hr":
        g.hrepulsion()
    return g.show(file_name)
create_pyvis_graph_partition(df_nodes, df_edgelist,
height="800px", width="100%", bgcolor="#FDFEFE", font_color="#1E1C1E", algo="barnes",
notebook=True, buttons_YN=False, edge_width_YN=True, file_name="partition_graph.html")
# Create the CircosPlot object: c
plt.rcParams['axes.facecolor'] = 'white'
c_partition = CircosPlot(G,
node_grouping="Partition",
node_order="Partition",
node_color="Partition",
edge_width="weight",
node_label_layout="rotation",
group_label_position="middle",
group_label_offset=2,
figsize=(15,15))
# Draw c to the screen
c_partition.draw()
# Display the plot
plt.show()
First, the groups generated by louvain algorithm are investigated and tried to make sense of by applying some EDA to it.
df_nodes["Partition"].value_counts()
Noticeable is that some groups are very small and cannot really be interpreted due to that.
def partition_composition(nodes, criterion, n=1):
    """
    Gives out a dataframe which groups the different groups from the partition and gives out the top n shares
    by the criterion of each group out.
    """
    # Per partition: normalized value counts of *criterion*, then the top-n
    # shares of each partition with the added group level dropped.
    # NOTE(review): this relies on the normalized value_counts result keeping
    # the name *criterion*; newer pandas renames it to "proportion" — verify
    # against the pinned pandas version.
    return pd.DataFrame(pd.DataFrame(nodes.groupby(["Partition"])[criterion].\
        value_counts(normalize=True, sort=True,ascending=False)).\
        groupby(level="Partition")[criterion].nlargest(n).reset_index(level=0, drop=True))
partition_composition(df_nodes, "Genre")
Interpreting the different groups generated by the louvain algorithm in the perspective of genres shows that it did not detect too many different genres and is mainly dominated by pop.
partition_composition(df_nodes, "Tracks Bins")
partition_composition(df_nodes, "Popularity Bins")
Not a single group was created that holds the very popular artists. This seems reasonable since it seems likely that they very connected and makes it more difficult for the algorithm to delimit. This hypothesis has be investigated further later on.
def artist_weight_grouping_top_n(nodes, criterion, n = 5):
    """
    Gives out a dataframe with the top n weight of artists grouped by the criterion given in.
    """
    # NOTE(review): depends on the module-level `edges_count` frame (built
    # from df_edgelist earlier in the notebook) — the function is not
    # self-contained.
    # Steps: join each artist's summed edge weight onto *nodes*, collapse to
    # one weight per (group, artist), then keep the n heaviest per group.
    return pd.DataFrame(pd.merge(nodes.loc[:,["Artist Name", criterion]],
        pd.DataFrame(edges_count.groupby('Artists')["Weight"].sum().sort_values(ascending=False)),
        left_on="Artist Name", right_index=True, how="left").\
        groupby([criterion, "Artist Name"]).max().\
        groupby(level=criterion)["Weight"].nlargest(n).reset_index(level=0, drop=True))
def artist_cent_grouping_top_n(nodes, criterion, cent_type, n=5):
    """
    Return a dataframe with the top n artists by the centrality measure
    ``cent_type``, grouped by the given criterion.
    """
    # Mean centrality score per (group, artist) pair
    mean_cent = nodes.groupby([criterion, "Artist Name"])[cent_type].mean()
    # Within each group keep only the n best-scoring artists, dropping the
    # helper index level added by the second groupby
    top = pd.DataFrame(mean_cent).groupby(level=criterion)[cent_type].nlargest(n)
    return pd.DataFrame(top.reset_index(level=0, drop=True))
artist_weight_grouping_top_n(df_nodes, "Partition", 3)
artist_cent_grouping_top_n(df_nodes, "Partition", "Degree Centrality", 3)
artist_cent_grouping_top_n(df_nodes, "Partition", "Eigenvector Centrality", 3)
def dens_and_trans_calc(df_artists, df_edgelist, group_criterion):
    """
    Print a PrettyTable with the density and transitivity of the sub-network
    of every group found in ``group_criterion``.

    Input:
        - df_artists (dataframe): artist data, one row per artist
        - df_edgelist (dataframe): edge list of the collaboration network
        - group_criterion (str): column of df_artists to group the artists by
    Output:
        - None; the table is printed to stdout
    """
    table = PrettyTable(["Group", "Density", "Transitivity"])
    for criterion in df_artists[group_criterion].unique():
        # Some artists have no value assigned for the criterion (NaN); skip
        # those so they do not show up in the table. Strings are always kept,
        # numeric values only when they are not NaN. (The original duplicated
        # the whole body in two branches; a single guard is equivalent.)
        if not isinstance(criterion, str) and math.isnan(criterion):
            continue
        G_temp = generate_networkx(df_artists[df_artists[group_criterion] == criterion],
                                   df_edgelist,
                                   [group_criterion])
        table.add_row([criterion,
                       round(nx.density(G_temp), 4),
                       round(nx.transitivity(G_temp), 4)])
    print(table)
dens_and_trans_calc(df_nodes, df_edgelist, "Partition")
Community 7 observed by itself is the strongest connected network. It has over a quarter of all possible connections and over half of all possible triangle connections.
print(nx.attribute_assortativity_coefficient(G, "Partition"))
The assortativity measure shows the similarity of connections in regard to the given attribute. With 0.47 this value is quite high compared to the ones calculated later on, which is logical since detecting similarity within communities was the goal of the algorithm. However, these generated communities are difficult for humans to interpret and to draw conclusions from.
def create_pyvis_graph_genre(df_artists, df_edgelist, height="1500px", width="100%", bgcolor="#222222",
                             font_color="#1E1C1E", algo="barnes", notebook=True,
                             buttons_YN=True, edge_width_YN=True, heading="", file_name="pyvis_graph.html"):
    """
    Creates a pyvis visualization of a network colored by genre, edge width by weight and node size by
    degree centrality.
    Input:
        - df_artists: dataframe holding the relevant artist data
        - df_edgelist: dataframe holding the relevant edges
        - height (str): defines height of the graph
        - width (str): defines width of the graph
        - bgcolor (str): defines color of the background (HTML color code can be used or simple color naming)
        - font_color (str): defines the font color (HTML color code can be used or simple color naming)
        - algo (str): the options "barnes", "forced" and "hr" are possible options for the algorithm to set up graph
        - notebook (boolean): determines if graph is saved as separate HTML file or shown within the notebook
        - buttons_YN (boolean): adds a bar with design setting options on the side
        - edge_width_YN (boolean): adjusts the width of the edges based on the weight if turned on
        - heading (str): adds a header to the graph
        - file_name (str): name of the HTML file the graph is rendered into
    Output:
        pyvis graph
    """
    # Set up network
    g = Network(height=height, width=width,
                bgcolor=bgcolor, font_color=font_color,
                notebook=notebook, heading=heading)
    # Set width of graph depending on whether the settings bar should be shown
    # on the side; only relevant outside the notebook since showing it there
    # results in issues
    if notebook == True:
        if buttons_YN == True:
            g.show_buttons()
        else:
            g.width = "100%"
    else:
        if buttons_YN == True:
            g.width = "70%"
            g.show_buttons(["physics"])
        else:
            g.width = "100%"
    # Build the set of artists that appear in the edge list ONCE; the original
    # recreated this set on every iteration of the node loop
    connected_artists = set(pd.concat([df_edgelist["Artist_1"], df_edgelist["Artist_2"]]))
    # Create a node for every artist that has at least one edge
    for artist, value, genre, color in zip(df_artists["Artist Name"],
                                           df_artists["Degree Centrality"],
                                           df_artists["Genre"],
                                           df_artists["Genre Color"]):
        if artist in connected_artists:
            # bug fix: the original passed partition=partition, but no
            # `partition` variable exists in this scope (NameError on the
            # first node added)
            g.add_node(artist,
                       value=value,
                       title=f"Artist: {artist} | Genre: {genre}",
                       color=color)
    # Create edges with all artist pairs
    if edge_width_YN == True:
        for artist_1, artist_2, weight in zip(df_edgelist["Artist_1"],
                                              df_edgelist["Artist_2"],
                                              df_edgelist["Weight"]):
            g.add_edge(artist_1, artist_2, weight=weight, width=weight, title=f"Weight: {weight}")
    else:
        for artist_1, artist_2, weight in zip(df_edgelist["Artist_1"],
                                              df_edgelist["Artist_2"],
                                              df_edgelist["Weight"]):
            g.add_edge(artist_1, artist_2, weight=weight, title=f"Weight: {weight}")
    # Set layout algorithm
    if algo == "barnes":
        g.barnes_hut()
    elif algo == "forced":
        g.force_atlas_2based()
    elif algo == "hr":
        g.hrepulsion()
    return g.show(file_name)
create_pyvis_graph_genre(df_nodes, df_edgelist, height="800px", width="100%", bgcolor="#FDFEFE", font_color="#1E1C1E", algo="barnes",
notebook=True, buttons_YN=False, edge_width_YN=True, file_name="genre_graph.html")
Latin and German hip hop are highly connected with themselves, so these artists prefer collaborating with people within their genre most. Pop looks like the biggest genre within the network, with quite some important artists as main collaborators, also for artists of other genres. However, the most connected network seems to be pop rap, with many connections to artists of other genres, but also very strongly connected in itself. Also noticeable is their high rate of artists with high degree centrality scores, such as Wiz Khalifa, Lil Wayne, Big Sean and Gucci Mane.
Filtering further down by amount of connections to make it less messy $\rightarrow$ five or more collaborations
len(df_edgelist[df_edgelist["Weight"]>4])
create_pyvis_graph_genre(df_nodes, df_edgelist[df_edgelist["Weight"]>4],
height="800px", width="100%", bgcolor="#FDFEFE", font_color="#1E1C1E", algo="barnes",
notebook=True, buttons_YN=False, edge_width_YN=True, file_name="genre_graph_five_plus.html")
Pop rap, latin and german hip hop still pop out as strongly connected in this network. For german hip hop the heavy collaborations between RAF Camora and Bonez MC as well as KC Rebell and Summer Cem are notable. In the latin genre the interconnectedness of the four artists Sech, Dalex, Justin Quiles and Lenny Tavarez is special.
Filtering further down by amount of connections to make it less messy $\rightarrow$ ten or more collaborations
len(df_edgelist[df_edgelist["Weight"]>9])
create_pyvis_graph_genre(df_nodes, df_edgelist[df_edgelist["Weight"]>9],
height="800px", width="100%", bgcolor="#FDFEFE", font_color="#1E1C1E", algo="barnes",
notebook=True, buttons_YN=False, edge_width_YN=True, file_name="genre_graph_ten_plus.html")
The heaviest collaborators in the overall network can be noticed well in this last pyvis visualization. Gucci Mane, Rick Ross and Lil Wayne are highly connected artists. DJ Khaled, who falls under the pop genre, holds strong connections to many of the top pop rap artists.
# Create the CircosPlot object: c
plt.rcParams['axes.facecolor'] = 'white'
# Circos plot of the full graph: nodes grouped, ordered and colored by genre;
# edge width follows the collaboration weight so heavy partnerships stand out
c_genre = CircosPlot(G,
                     node_grouping="Genre",
                     node_order="Genre",
                     node_color="Genre",
                     edge_width="weight",
                     node_label_layout="rotation",
                     group_label_position="middle",
                     group_label_offset=2,
                     figsize=(15,15))
# Draw c to the screen
c_genre.draw()
# Display the plot
plt.show()
The circos plot shows, like the interactive pyvis visualization before, the strong collaboration of artists within pop rap and rap, but also between these two genres. However, the circos plot is also very handy to analyse the groups with very few collaborations. Contemporary country, alternative metal, rock and modern rock stand out. Since in the rock genre the artists are often not individuals but bands, this comes as no surprise; it makes collaborations with even more people very difficult. For contemporary country a possible explanation is that it is a very specific genre that does not work too well with other mainstream genres. Pop, on the other hand, works quite well for collaborations, especially with pop rap. Here, however, only some of the huge group of artists grew into popular cross-genre collaborators. As the next graph shows, among them are Chris Brown, Nicki Minaj, DJ Khaled and T-Pain.
In the following, the most influential and collaborative artists can be observed individually. The same names as mentioned before clearly stand out.
# Create the CircosPlot object: c
plt.rcParams['axes.facecolor'] = 'white'
# Same genre circos plot, restricted to pairs with five or more collaborations
c_genre = CircosPlot(G_five_plus,
                     node_grouping="Genre",
                     node_color="Genre",
                     node_labels=True,
                     node_label_layout="rotation",
                     edge_width="weight",
                     group_label_position="middle",
                     group_label_offset=14,
                     figsize=(15,15))
c_genre.draw_group_labels()
# Draw c to the screen
c_genre.draw()
# Display the plot
plt.show()
# Create the CircosPlot object: c
plt.rcParams['axes.facecolor'] = 'white'
# And again, restricted to pairs with ten or more collaborations
c_genre = CircosPlot(G_ten_plus,
                     node_grouping="Genre",
                     node_color="Genre",
                     node_labels=True,
                     node_label_layout="rotation",
                     edge_width="weight",
                     group_label_position="middle",
                     group_label_offset=10,
                     figsize=(15,15))
c_genre.draw_group_labels()
# Draw c to the screen
c_genre.draw()
# Display the plot
plt.show()
The table shows the highest connected artists based on the weight for each genre.
artist_weight_grouping_top_n(df_nodes, "Genre", 3)
The table shows the highest connected artists based on the degree centrality for each genre.
artist_cent_grouping_top_n(df_nodes, "Genre", "Degree Centrality", 3)
The table shows the highest connected artists based on the eigenvector centrality for each genre.
artist_cent_grouping_top_n(df_nodes, "Genre", "Eigenvector Centrality", 3)
dens_and_trans_calc(df_nodes, df_edgelist, "Genre")
print(nx.attribute_assortativity_coefficient(G, "Genre"))
Artists have a much higher tendency to collaborate with artists within their genre than a random genre.
The following circos plot shows the artists ranked starting from the left going around anticlockwise.
# Create the CircosPlot object: c
plt.rcParams['axes.facecolor'] = 'white'
# Nodes ordered by chart rank (starting on the left, going anticlockwise)
c_rank = CircosPlot(G,
                    node_order="Rank",
                    node_labels=True,
                    node_label_layout="rotation",
                    edge_width="weight",
                    figsize=(20,20),
                    fontsize=8)
# Draw c to the screen
c_rank.figure.tight_layout()
c_rank.draw()
# Display the plot
plt.show()
The graph shows that the artists ranked higher generally have stronger network and produce more tracks in collaboration than the lower rank artists.
# Create the CircosPlot object: c
plt.rcParams['axes.facecolor'] = 'white'
# Nodes grouped and colored by popularity bin; node labels off to reduce clutter
c_popularity = CircosPlot(G,
                          node_grouping="Popularity Bins",
                          node_color="Popularity Bins",
                          node_labels=False,
                          node_label_layout="rotation",
                          edge_width="weight",
                          group_label_position="middle",
                          group_label_offset=10,
                          figsize=(15,15))
# Draw c to the screen
c_popularity.figure.tight_layout()
c_popularity.draw_group_labels()
c_popularity.draw()
# Display the plot
plt.show()
Since the higher ranked artists are also the more popular ones, the graphs show very similar results. If artists are more popular they tend to have more collaborations, and they seem to collaborate more with artists who move in the same area of popularity.
# Create the CircosPlot object: c
plt.rcParams['axes.facecolor'] = 'white'
# Nodes grouped and colored by the number-of-tracks bin
c_tracks = CircosPlot(G,
                      node_grouping="Tracks Bins",
                      node_color="Tracks Bins",
                      node_labels=False,
                      node_label_layout="rotation",
                      edge_width="weight",
                      group_label_position="middle",
                      group_label_offset=10,
                      figsize=(15,15))
# Draw c to the screen
c_tracks.figure.tight_layout()
c_tracks.draw()
# Display the plot
plt.show()
The last graph supports what common sense would suggest. Artists who produce more songs overall also produce, in absolute numbers, more songs in collaboration. More experienced artists seem to choose to collaborate with artists who have also already produced many songs.
artist_weight_grouping_top_n(df_nodes, "Tracks Bins", 3)
artist_weight_grouping_top_n(df_nodes, "Popularity Bins", 3)
artist_cent_grouping_top_n(df_nodes, "Popularity Bins", "Degree Centrality", 3)
print(nx.attribute_assortativity_coefficient(G, "Popularity Bins"))
The number supports the visual observation that artists within the same range of popularity have a slightly higher tendency to collaborate.
print(nx.attribute_assortativity_coefficient(G, "Tracks Bins"))
The number supports the visual observation that artists within the same range of overall produced songs have a slightly higher tendency to collaborate.
Bringing all this together: we want a function which gives us the hottest artists of a genre based on the preferred collaboration measure. Having identified them, in the next step we try to figure out how we can reach these artists.
def top_n_artists_in_genre(df, genre, collab_measure, n):
    """
    The function gives out the top n artists by the desired genre and overall collaboration measure.
    Input:
        - df (dataframe): artist dataframe
        - genre (str): the genre to filter on
        - collab_measure (str): the desired collaboration measure of an artist ("Eigenvector Centrality",
          "Degree Centrality" or "Weight")
        - n (int): number of artists to return
    Output:
        - df (dataframe): artist name and measure, sorted descending, first n rows
    """
    # Restrict to the requested genre and the two columns of interest
    in_genre = df.loc[df["Genre"] == genre, ["Artist Name", collab_measure]]
    # Highest measure first, then cut down to the first n rows
    ranked = in_genre.sort_values(by=[collab_measure], ascending=False)
    return pd.DataFrame(ranked).head(n)
top_n_artists_in_genre(df_nodes, "german hip hop", "Degree Centrality", 5)
Scenario 1: If we know which artist we want to get our hands on, we possibly already know someone in the music industry. In this case we want to see over which chain we can reach out to our desired artist for collaboration. The assumption is that there is a higher chance to know someone when they are less famous. Second, a stronger connection between the artists increases the probability that it will be helpful.
def get_in_contact_with_connection(G, df_artist, df_edgelist, source, target, n):
    """
    Gives out possible chains of artists showing how to reach an artist for a
    collaboration, with one artist known as starting point. The sum of the ranks
    of all artists in each chain is calculated and the chains are returned in
    descending order of that sum.
    Input:
        - G (network): network to analyze
        - df_artist (dataframe): dataframe that holds the information of the artists
        - df_edgelist (dataframe): dataframe that holds the edges and their weights
        - source (str): artist that is known
        - target (str): artist that want to be reached out to
        - n (int): amount of chains shown
    Output:
        - df (dataframe): holds n chains of connections to reach the desired artist.
    """
    # calculate all shortest paths from source to target (one row per chain)
    df = pd.DataFrame(nx.all_shortest_paths(G, source, target))
    # lists to place sums of rank and weight of each chain in
    sum_list_rank = []
    sum_list_weight = []
    for index, row in df.iterrows():
        # temporary per-chain values of rank and weight, summed at the end
        temp_list_rank = []
        temp_list_weight = []
        for artist_iter in range(len(row) - 1):
            # subset of all edges where the first artist appears on either side
            subset = df_edgelist[(df_edgelist["Artist_1"] == row[artist_iter]) |
                                 (df_edgelist["Artist_2"] == row[artist_iter])]
            # narrow the subset down to the pair's row by looking for the second artist
            temp_list_weight.append(subset[(subset["Artist_1"] == row[artist_iter + 1]) |
                                           (subset["Artist_2"] == row[artist_iter + 1])].iloc[0]["Weight"])
        sum_list_weight.append(sum(temp_list_weight))
        # same idea of calculating the sum, but simpler loop since only one artist name required
        # (bug fix: use the df_artist parameter; the original referenced an
        # undefined global `df_artists` here)
        for artist in row:
            temp_list_rank.append(df_artist[df_artist["Artist Name"] == artist].iloc[0]["Rank"])
        sum_list_rank.append(sum(temp_list_rank))
    # adding the weight and rank sums to the df
    df["Sorting Criterion Weight Sum"] = sum_list_weight
    df["Sorting Criterion Rank Sum"] = sum_list_rank
    # sorting the df by rank sum, highest first
    df.sort_values(by=["Sorting Criterion Rank Sum"], ascending=False, inplace=True)
    df.reset_index(inplace=True, drop=True)
    # return the n first rows of the df
    return df[:n]
get_in_contact_with_connection(G, df_nodes, df_edgelist, "Kollegah", "Ufo361", 5)
Scenario 2: We know which artist we want to get our hands on and don't know anyone in the music industry. In this case we determine how many degrees away we are willing to start to get in contact with the artist, and then see over which chains we can reach out to our desired artist for collaboration. The assumption is that there is a higher chance to know someone when they are less famous. Second, a stronger connection between the artists increases the probability that it will be helpful.
def get_in_contact_without_connection(G, df_artists, df_edgelist, target, degree, n):
    """
    Gives out possible chains of artists showing how to reach an artist for a
    collaboration with no artist known as starting point. A degree is given in to
    determine how far away we are willing to start. The sum of the ranks of all
    artists in each chain is calculated and the chains are returned in descending
    order of that sum.
    Input:
        - G (network): network to analyze
        - df_artists (dataframe): dataframe that holds the information of the artists
        - df_edgelist (dataframe): dataframe that holds the edges and their weights
        - target (str): artist that want to be reached out to
        - degree (int): how many degrees away are we willing to start reaching out
        - n (int): amount of chains shown
    Output:
        - df (dataframe): holds n chains of connections to reach the desired artist.
    """
    # all artists exactly `degree` hops away from the target
    degree_artists = [key for (key, value) in nx.single_source_shortest_path_length(G, target).items()
                      if value == degree]
    # Collect every shortest path from each starting artist to the target; all
    # paths have the same length (degree + 1), so they form a rectangular frame.
    # (DataFrame.append was removed in pandas 2.0 — build the frame in one go
    # instead of appending per starting artist. This also avoids the IndexError
    # the original raised when no artist was found at the requested degree.)
    paths = []
    for start_artist in degree_artists:
        paths.extend(nx.all_shortest_paths(G, start_artist, target))
    df = pd.DataFrame(paths)
    # lists to place sums of rank and weight of each chain in
    sum_list_rank = []
    sum_list_weight = []
    for index, row in df.iterrows():
        # temporary per-chain values of rank and weight, summed at the end
        temp_list_rank = []
        temp_list_weight = []
        for artist_iter in range(len(row) - 1):
            # subset of all edges where the first artist appears on either side
            subset = df_edgelist[(df_edgelist["Artist_1"] == row[artist_iter]) |
                                 (df_edgelist["Artist_2"] == row[artist_iter])]
            # narrow the subset down to the pair's row by looking for the second artist
            temp_list_weight.append(subset[(subset["Artist_1"] == row[artist_iter + 1]) |
                                           (subset["Artist_2"] == row[artist_iter + 1])].iloc[0]["Weight"])
        sum_list_weight.append(sum(temp_list_weight))
        # same idea of calculating the sum, but simpler loop since only one artist name required
        for artist in row:
            temp_list_rank.append(df_artists[df_artists["Artist Name"] == artist].iloc[0]["Rank"])
        sum_list_rank.append(sum(temp_list_rank))
    # adding the weight and rank sums to the df
    df["Sorting Criterion Weight Sum"] = sum_list_weight
    df["Sorting Criterion Rank Sum"] = sum_list_rank
    # sorting the df by rank sum, highest first
    df.sort_values(by=["Sorting Criterion Rank Sum"], ascending=False, inplace=True)
    df.reset_index(inplace=True, drop=True)
    # return the n first rows of the df
    return df[:n]
get_in_contact_without_connection(G, df_nodes, df_edgelist, "Grateful Dead", 6, 5)
# Load the data
# The "Lyrics Clean Tok" column was stored as a stringified Python list, so
# ast.literal_eval restores it to an actual list of tokens while reading
df_lyrics = pd.read_csv(PATH_LYRICS_CLEAN, index_col=0, converters={"Lyrics Clean Tok": ast.literal_eval})
df_lyrics.head()
Let's start understanding the song lyrics by looking at the most frequently occurring words.
def plot_top_n_words(df_column, n=15):
    """
    Creates a barplot of top n words.
    Parameters:
        df_column (pandas series): Series consisting of tokenized texts
        n (int): Number of words that should be displayed
    """
    # Flatten the token lists and count every word
    token_counter = Counter(token for tokens in df_column for token in tokens)
    most_common = token_counter.most_common(n)
    words = [word for word, _ in most_common]
    counts = [count for _, count in most_common]
    plt.figure(figsize=(20,10))
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    plt.barh(y=words, width=counts)
    # Flip the y-axis so the most frequent word sits at the top
    plt.gca().invert_yaxis()
    plt.title(f"Top {n} most common words",fontsize=20)
    plt.show()
plot_top_n_words(df_lyrics["Lyrics Clean Tok"],20)
Do different genres use different words? Let's have a look.
plot_top_n_words(df_lyrics[df_lyrics["Genre"] == "pop"]["Lyrics Clean Tok"],15)
plot_top_n_words(df_lyrics[df_lyrics["Genre"] == "rock"]["Lyrics Clean Tok"],15)
plot_top_n_words(df_lyrics[df_lyrics["Genre"] == "pop rap"]["Lyrics Clean Tok"],15)
plot_top_n_words(df_lyrics[df_lyrics["Genre"] == "rap"]["Lyrics Clean Tok"],15)
plot_top_n_words(df_lyrics[df_lyrics["Genre"] == "pop dance"]["Lyrics Clean Tok"],15)
plot_top_n_words(df_lyrics[df_lyrics["Genre"] == "country"]["Lyrics Clean Tok"],15)
plot_top_n_words(df_lyrics[df_lyrics["Genre"] == "alternative metal"]["Lyrics Clean Tok"],15)
plot_top_n_words(df_lyrics[df_lyrics["Genre"] == "latin"]["Lyrics Clean Tok"],15)
While some observations can already be made (e.g. words like nigga, bitch, fuck, shit predominantly appear in the rap category), the words distributions across the other genres look pretty much alike. We will return to this later to find a better way of determining the genres based on the wording
Apart from the content, also the word count of the songs can be analysed. To do so, let's first create a new column that stores the number of words of each song. Let's first check the distribution of song length.
# Count the number of tokens for each lyric
df_lyrics["Word Count"] = df_lyrics["Lyrics Clean Tok"].apply(lambda x: len(x))
# Count the number of distinct tokens for each lyric
df_lyrics["Distinct Word Count"] = df_lyrics["Lyrics Clean Tok"].apply(lambda x: len(set(x)))
# Get ratio of distinct words over all words in lyric
# NOTE(review): a lyric with an empty token list would make this a division by
# zero — assumes the cleaning step guarantees non-empty lyrics; verify upstream
df_lyrics["Word Variability"] = df_lyrics["Distinct Word Count"] / df_lyrics["Word Count"]
df_lyrics.head()
# Plot track length
sns.displot(df_lyrics["Word Count"])
The song length seems to be quite normally distributed. However, it's difficult to see as there seems to be a strong outlier at around 12,000 words. Let's identify this song.
df_lyrics[df_lyrics["Word Count"] == max(df_lyrics["Word Count"])]
When this outlier is removed, the data is still right-skewed as there are a few more songs outside the expected distribution.
sns.displot(df_lyrics[df_lyrics["Word Count"] < max(df_lyrics["Word Count"])]["Word Count"])
How do genres differ with regards to their song length?
# Group genre by their mean, median, and max of word count and their total number of occurences
grouping_wc = df_lyrics.groupby("Genre", sort=True).agg({'Word Count': ['mean', 'median', 'max',"count"]}).reset_index()
grouping_wc
Rappers clearly produce the longest songs. Even though the mean might be skewed by the extremely long song mentioned above, the median confirms that rap songs tend to be longer.
# Group genre by their mean, median, and max of distinct word count and their total number of occurences
grouping_dwc = df_lyrics.groupby("Genre", sort=True).agg({'Distinct Word Count': ['mean', 'median', 'max',"count"]}).reset_index()
grouping_dwc
# Group genre by their mean, median, and max of word variability and their total number of occurences
grouping_wv = df_lyrics.groupby("Genre", sort=True).agg({'Word Variability': ['mean', 'median', 'max',"count"]}).reset_index()
grouping_wv
# Look at lyrics that have variability less than 0.05 (i.e. that are extremely repetitive)
df_lyrics[df_lyrics["Word Variability"] < 0.05].head()
Let us now check whether it is possible to predict the genre of a song based on its lyrics. This is done using a logistic regression.
def plot_result_analysis(y_test, y_pred, class_labels):
    """
    Print the classification report and draw the confusion matrix as a heatmap.
    """
    print(classification_report(y_test, y_pred, zero_division=False))
    # Confusion matrix rendered with the class labels on both axes
    matrix = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(matrix, annot=True, fmt='d', cmap="YlGnBu",
                xticklabels=class_labels, yticklabels=class_labels)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
# Features: the untokenized cleaned lyrics; target: the genre
X = df_lyrics["Lyrics Clean No Tok"]
y = df_lyrics["Genre"]
# Stratified split keeps the genre distribution equal in train and test
X_train,X_test,y_train,y_test = train_test_split(X, y, train_size = 0.75, stratify = y, random_state=RANDOM)
# Simple Pipe
# Baseline: TF-IDF features into a default logistic regression
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('class', LogisticRegression(max_iter=1000,random_state=RANDOM)),
])
y_pred = pipeline.fit(X_train,y_train).predict(X_test)
plot_result_analysis(y_test,y_pred,pipeline.classes_)
eli5.show_weights(pipeline.named_steps["class"], vec=pipeline.named_steps["vect"], top=30)
Just based on the logistic regression in its default configuration, several things already become apparent.
Based on these observations, the following steps are taken to improve the model:
# Drop Latin and Alternative Metal
# bug fix: the original chained two bare reset_index() calls, which inserts the
# old index as a column on the first call and can then raise
# "cannot insert index, already exists" on the second; reset once with drop=True
df_lyrics = df_lyrics[~df_lyrics["Genre"].isin(["latin", "alternative metal"])].reset_index(drop=True)
# Rename hip hop to rap
df_lyrics['Genre'] = df_lyrics['Genre'].replace(['pop rap'],'rap')
df_lyrics["Genre"].value_counts()
# Re-run the baseline model on the cleaned-up genre classes
X = df_lyrics["Lyrics Clean No Tok"]
y = df_lyrics["Genre"]
X_train,X_test,y_train,y_test = train_test_split(X, y, train_size = 0.75, stratify = y, random_state=RANDOM)
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('class', LogisticRegression(max_iter=1000,random_state=RANDOM)),
])
y_pred = pipeline.fit(X_train,y_train).predict(X_test)
plot_result_analysis(y_test,y_pred,pipeline.classes_)
eli5.show_weights(pipeline.named_steps["class"], vec=pipeline.named_steps["vect"], top=30)
X = df_lyrics["Lyrics Clean No Tok"]
y = df_lyrics["Genre"]
X_train,X_test,y_train,y_test = train_test_split(X, y, train_size = 0.75, stratify = y, random_state=RANDOM)
# Pipe with GridSearch LogReg (takes quite long to run)
X = df_lyrics["Lyrics Clean No Tok"]
y = df_lyrics["Genre"]
# Split dataset into train and test data
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.75,stratify=y,random_state=RANDOM)
# Define paramters for grid search
parameters = {"vect__max_df": [0.1, 0.3, 1.0],
"vect__max_features":[10000, 50000, None],
"class__C":[0.1, 1, 5, 10]}
# Define Pipeline Steps
pipeline = Pipeline([('vect', TfidfVectorizer()),
('class', LogisticRegression(max_iter=1000,
random_state=RANDOM))])
# Train the classifier through cross-validated grid search
clf = GridSearchCV(pipeline,
parameters,
cv=3,
scoring = "precision_macro",
n_jobs = -1,
verbose = 10)
clf.fit(X_train,y_train)
# Predict y-values using the classifier
y_pred = clf.predict(X_test)
# Calculate accuracy of the model
score = accuracy_score(y_test,y_pred)
print("Accuracy Score:",score)
# Create and print confusion matrix and classification report
cm = confusion_matrix(y_test,y_pred)
print(cm)
print(classification_report(y_test,y_pred,zero_division=False))
clf.best_params_
# Refit a single pipeline with the best parameters found by the grid search
pipeline = Pipeline([
    ('vect', TfidfVectorizer(max_df=0.1,max_features=50000)),
    ('class', LogisticRegression(max_iter=1000,random_state=RANDOM, C=1)),
])
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
plot_result_analysis(y_test,y_pred,pipeline.classes_)
eli5.show_weights(pipeline.named_steps["class"], vec=pipeline.named_steps["vect"], top=30)
# Vectorize the whole corpus up front so the oversampler below can operate on
# the TF-IDF matrix directly
X = df_lyrics["Lyrics Clean No Tok"]
y = df_lyrics["Genre"]
tfidf = TfidfVectorizer(max_features=50000)
X_vect = tfidf.fit_transform(X)
X_train,X_test,y_train,y_test = train_test_split(X_vect, y, train_size = 0.75, stratify = y, random_state=RANDOM)
y_train.value_counts()
# Instantiate random oversampler
oversample = RandomOverSampler(random_state=RANDOM)
# Oversample the train data so every genre is equally represented
# (fit_sample was removed in imbalanced-learn 0.8; fit_resample is the API)
X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)
y_train_over.value_counts()
# Define paramters for grid search
# (vectorization already happened above, so only the regularization C is tuned)
parameters = {"class__C":[0.1, 1, 5, 10]}
# Define Pipeline Steps
pipeline = Pipeline([('class', LogisticRegression(max_iter=1000, random_state=RANDOM))])
# Train the classifier through cross-validated grid search
clf = GridSearchCV(pipeline,
                   parameters,
                   cv=3,
                   scoring = "precision_macro",
                   n_jobs = -1,
                   verbose = 10)
clf.fit(X_train_over,y_train_over)
y_pred = clf.predict(X_test)
clf.best_params_
# Refit with the best C found on the oversampled training data
pipeline = Pipeline([
    ('class', LogisticRegression(max_iter=1000,random_state=RANDOM, C=10)),
])
pipeline.fit(X_train_over,y_train_over)
y_pred = pipeline.predict(X_test)
plot_result_analysis(y_test,y_pred,pipeline.classes_)
eli5.show_weights(pipeline.named_steps["class"], vec=tfidf, top=30)
# Gensim dictionary mapping every unique token in the corpus to an id
word_dict = Dictionary(df_lyrics["Lyrics Clean Tok"])
print(len(word_dict))
# Frequency of every token across the whole corpus
word_count = Counter([word for words in df_lyrics["Lyrics Clean Tok"] for word in words])
word_count_values = list(word_count.values())
print(f"Total amount of unique words: {len(word_count)}")
print(f"Total amount of unique words that appear more than 5000 times: {len([i for i in word_count_values if i > 5000])}")
print(f"Total amount of unique words that appear more than 2000 times: {len([i for i in word_count_values if i > 2000])}")
print(f"Total amount of unique words that appear more than 1000 times: {len([i for i in word_count_values if i > 1000])}")
print(f"Total amount of unique words that appear more than 10 times: {len([i for i in word_count_values if i > 10])}")
print(f"Total amount of unique words that appear less than 5 times: {len([i for i in word_count_values if i < 5])}")
# bug fix: the condition is <= 2, but the original label claimed "appear 1 time";
# the label now matches what is actually counted
print(f"Total amount of unique words that appear at most 2 times: {len([i for i in word_count_values if i <= 2])}")
# Frequency percentiles of the vocabulary, used below to pick filter thresholds
perc_25 = np.quantile(word_count_values,0.25)
perc_50 = np.quantile(word_count_values,0.5)
perc_75 = np.quantile(word_count_values,0.75)
perc_99 = np.quantile(word_count_values,0.99)
perc_995 = np.quantile(word_count_values,0.995)
perc_999 = np.quantile(word_count_values,0.999)
print(f"\n\n25% Percentile at {perc_25}.\nTotal amount of unique words that appear more than this percentile: {len([i for i in word_count_values if i > perc_25])}\n\n")
print(f"50% Percentile at {perc_50}.\nTotal amount of unique words that appear more than this percentile: {len([i for i in word_count_values if i > perc_50])}\n\n")
print(f"75% Percentile at {perc_75}.\nTotal amount of unique words that appear more than this percentile: {len([i for i in word_count_values if i > perc_75])}\n\n")
print(f"99% Percentile at {perc_99}.\nTotal amount of unique words that appear more than this percentile: {len([i for i in word_count_values if i > perc_99])}\n\n")
print(f"99.5% Percentile at {perc_995}.\nTotal amount of unique words that appear more than this percentile: {len([i for i in word_count_values if i > perc_995])}\n\n")
print(f"99.9% Percentile at {perc_999}.\nTotal amount of unique words that appear more than this percentile: {len([i for i in word_count_values if i > perc_999])}")
# Words above the 99.9% frequency percentile carry little distinguishing signal
too_frequent_words = [word for word, count in word_count.items() if count > perc_999]
# Words appearing fewer than 10 times are too rare to learn topics from
too_unfrequent_words = [word for word, count in word_count.items() if count < 10]
# Hand-picked filler words that dominate topics without being informative
manual_remove = ("something someone everything everybody nothing somebody really nobody okay woah whoa another cause take never "
                 "back want ever forever last first well without enough world life better alright look give live think would still hand "
                 "nigga bitch fuck fuckin good shit little keep").split(" ")
# Translate words to dictionary ids before filtering
# NOTE(review): token2id raises KeyError if a manual word is absent from the
# dictionary — assumes every listed word occurs in the corpus; verify
too_frequent_words_ids = [word_dict.token2id[word] for word in too_frequent_words]
too_unfrequent_words_ids = [word_dict.token2id[word] for word in too_unfrequent_words]
manual_remove_ids = [word_dict.token2id[word] for word in manual_remove]
words_to_delete = manual_remove_ids + too_unfrequent_words_ids + too_frequent_words_ids
word_dict.filter_tokens(bad_ids=words_to_delete)
print(len(word_dict))
# Bag-of-words representation of every lyric under the filtered dictionary
corpus_track = [word_dict.doc2bow(doc) for doc in df_lyrics["Lyrics Clean Tok"]]
# Create dictionary for perplexity scores
perplexity_scores = dict()
# Create dictionary for coherence scores
coherence_scores = dict()
# Iterate over 2 to 15 topics (takes quite long to execute)
for i in tqdm(range(2,16)):
    # Create an LDA Model with i topics
    lda_model = LdaModel(corpus_track,
                         id2word=word_dict,
                         num_topics=i,
                         random_state=RANDOM,
                         passes = 3,
                         alpha="auto")
    # Calculate perplexity score and append it to dict
    perplexity = lda_model.log_perplexity(corpus_track)
    perplexity_scores[i] = perplexity
    # Calculate coherence score (c_v) and append it to dict
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=df_lyrics["Lyrics Clean Tok"],
                                         dictionary=word_dict,
                                         coherence='c_v')
    coherence_scores[i] = coherence_model_lda.get_coherence()
# Plot perplexity scores
plt.plot(perplexity_scores.keys(), perplexity_scores.values())
plt.xlabel("Number of Topics")
plt.ylabel("Perplexity")
plt.title("Perplexity Scores")
plt.show()
# Plot Coherence scores
plt.plot(coherence_scores.keys(), coherence_scores.values())
plt.xlabel("Number of Topics")
plt.ylabel("Coherence")
plt.title("Coherence Scores")
plt.show()
# Fit the three candidate topic counts and inspect each with pyLDAvis
lda_model_track_3 = LdaModel(corpus_track,
                             id2word=word_dict,
                             num_topics=3,
                             random_state=RANDOM,
                             passes = 3,
                             alpha="auto")
lda_model_track_3.print_topics(-1)
lda_display_3 = pyLDAvis.gensim.prepare(lda_model_track_3, corpus_track, word_dict)
pyLDAvis.display(lda_display_3)
lda_model_track_6 = LdaModel(corpus_track,
                             id2word=word_dict,
                             num_topics=6,
                             random_state=RANDOM,
                             passes = 3,
                             alpha="auto")
lda_model_track_6.print_topics(-1)
lda_display_6 = pyLDAvis.gensim.prepare(lda_model_track_6, corpus_track, word_dict)
pyLDAvis.display(lda_display_6)
lda_model_track_13 = LdaModel(corpus_track,
                              id2word=word_dict,
                              num_topics=13,
                              random_state=RANDOM,
                              passes = 3,
                              alpha="auto")
lda_model_track_13.print_topics(-1)
lda_display_13 = pyLDAvis.gensim.prepare(lda_model_track_13, corpus_track, word_dict)
pyLDAvis.display(lda_display_13)
Based on the visualisations of the different topic configurations, the LDA model with 6 topics was determined to be the most suitable one. For that reason, each lyric is now assigned the topic that it belongs to the most.
def assign_topic(values, threshold=0.25):
    """
    Return the topic with the highest probability from (topic, value) pairs,
    provided that probability is larger than ``threshold``.

    Parameters:
        values (iterable of (topic, float)): topic/probability pairings
        threshold (float): minimum probability required for an assignment
                           (default 0.25, matching the original behaviour)

    Returns:
        The best-scoring topic, or None when no value exceeds the threshold.
    """
    max_value = threshold
    max_topic = None
    for topic, value in values:
        # Strict comparison: a value exactly at the threshold is not assigned
        if value > max_value:
            max_value = value
            max_topic = topic
    return max_topic
# Transform the corpus once and take the dominant topic per track.
# (The previous index loop `lda_model_track_6[corpus_track][i]` rebuilt the
# transformed-corpus wrapper on every iteration.)
topics = [assign_topic(doc_topics) for doc_topics in lda_model_track_6[corpus_track]]
# Add topic column to the dataframe
df_lyrics["Topic"] = topics
# Count number of unclassified tracks (no topic above the threshold)
df_lyrics["Topic"].isna().sum()
# Distribution of assigned topics
df_lyrics["Topic"].value_counts()
# Replace numeric topic ids with descriptive labels
topic_labels = {
    0: "Feelings",
    1: "Party",
    2: "Relationship",
    3: "Gangstar",
    4: "Rebel",
    5: "Rage",
}
df_lyrics["Topic"] = df_lyrics["Topic"].map(topic_labels)
# Absolute genre-by-topic counts
pd.crosstab(df_lyrics["Genre"], df_lyrics["Topic"])
# Row-normalised shares (topic distribution within each genre)
pd.crosstab(df_lyrics["Genre"], df_lyrics["Topic"]).apply(lambda row: round(row / row.sum(), 2), axis=1)
# Column-normalised shares (genre distribution within each topic)
pd.crosstab(df_lyrics["Genre"], df_lyrics["Topic"]).apply(lambda col: round(col / col.sum(), 2), axis=0)
def get_top_n_indices(iterable, n):
    """
    Return the top-n values and their indices, both in descending value order.

    Parameters:
        iterable (np.ndarray): 1-D array of similarity scores
        n (int): Number of top entries to return

    Returns:
        (list, np.ndarray): top-n values sorted descending, and the
        corresponding indices in the same order
    """
    # argpartition finds the top-n indices in O(len) without a full sort
    top_n_indices = np.argpartition(iterable, -n)[-n:]
    # Order those n indices by descending score (negate for descending argsort)
    top_n_indices_sorted = top_n_indices[np.argsort(-iterable[top_n_indices])]
    # Derive values from the sorted indices: one pass, and values/indices can
    # never disagree on ties (the original sorted them independently)
    top_n_values_sorted = list(iterable[top_n_indices_sorted])
    return top_n_values_sorted, top_n_indices_sorted
def find_most_similar_text(title, top_n):
    """
    Print a table of the top_n songs most similar to the given song, using the
    precomputed ``song_similarity`` matrix defined outside this function.

    Parameters:
        title (str or int): Track title (first match is used) or row index
        top_n (int): Number of similar songs to display
    """
    tab = PrettyTable(["Position", "Index", "Title", "Artist", "Score", "Genre", "Topic"])
    # Resolve a title string to its first matching row index; ints pass through
    if type(title) is str:
        index = df_lyrics.index[df_lyrics["Title"] == title][0]
    else:
        index = title
    # Pull the query song's metadata from the dataframe
    query_data = df_lyrics.iloc[index, :]
    query_artist = query_data["Artist"]
    query_title = query_data["Title"]
    query_genre = query_data["Genre"]
    query_topic = query_data["Topic"]
    # Retrieve the most similar rows from the similarity matrix
    top_values, top_indices = get_top_n_indices(song_similarity[index, :], top_n)
    # Collect metadata of each similar song into the table
    for position, (score, row_idx) in enumerate(zip(top_values, top_indices), start=1):
        row = df_lyrics.iloc[row_idx, :]
        tab.add_row([position, row_idx, row["Title"], row["Artist"],
                     round(score, 3), row["Genre"], row["Topic"]])
    # Display similarity table
    print(f'Most similar lyrics to "{query_title}" by {query_artist} (Index = {index}, Genre = {query_genre}, Topic = {query_topic}).\n')
    print(tab)
def compare_lyrics(*ids):
    """
    Print the full lyrics of an arbitrary selection of songs so they can be
    compared side by side.

    Parameters:
        *ids (int): Row indices of the songs that should be compared
    """
    for ID in ids:
        # Fetch the song's row and the columns we print
        row = df_lyrics.iloc[ID, :]
        artist = row["Artist"]
        title = row["Title"]
        lyrics = row["Lyrics"]
        print(f'"{title}" by {artist} (ID = {ID})\n\n{lyrics}\n\n\n')
# Build a TF-IDF matrix over the cleaned (untokenised) lyrics
vectorizer = TfidfVectorizer()
track_lyrics_tfidf = vectorizer.fit_transform(df_lyrics["Lyrics Clean No Tok"])
track_lyrics_tfidf.shape
# Pairwise cosine similarity between every pair of tracks
song_similarity = cosine_similarity(track_lyrics_tfidf)
# Zero the diagonal so a song never matches itself
np.fill_diagonal(song_similarity, 0)
song_similarity.shape
# Sanity checks: similar songs, then a side-by-side lyric comparison
find_most_similar_text("Wish You Were Here", 10)
compare_lyrics(3157, 3534)
def find_most_similar_artist(artist, top_n):
    """
    Print a table of the top_n artists most similar to the given artist, using
    the precomputed ``artist_similarity_matrix`` created outside this function.

    Parameters:
        artist (str): Name of the query artist
        top_n (int): Number of similar artists to display
    """
    tab = PrettyTable(["Position", "Index", "Artist", "Score", "Genre"])
    # Resolve the artist name to its row index (first match) and its genre
    artist_index = artist_lyrics_complete.index[artist_lyrics_complete["Artist"] == artist][0]
    artist_genre = artist_lyrics_complete[artist_lyrics_complete["Artist"] == artist]["Genre"].values[0]
    # Retrieve the most similar rows from the similarity matrix
    top_values, top_indices = get_top_n_indices(artist_similarity_matrix[artist_index, :], top_n)
    # Collect metadata of each similar artist into the table
    for ctr, (val, ind) in enumerate(zip(top_values, top_indices)):
        row = artist_lyrics_complete.iloc[ind, :]
        tab.add_row([ctr + 1, ind, row["Artist"], round(val, 3), row["Genre"]])
    # Display similarity table
    print(f'Most similar artist to "{artist}" (Index = {artist_index}, Genre = {artist_genre}).\n')
    print(tab)
# Build one row per artist with all of their lyrics concatenated
artist_lyrics_complete = df_lyrics.groupby(["Artist"], sort=False)["Lyrics Clean No Tok"].apply(lambda lyr: " ".join(lyr)).reset_index()
artist_lyrics_complete["Lyrics Clean Tok"] = df_lyrics.groupby(["Artist"], sort=False)["Lyrics Clean Tok"].apply(lambda tok: list(tok)).reset_index()["Lyrics Clean Tok"]
# Flatten the per-song token lists into a single token list per artist
artist_lyrics_complete["Lyrics Clean Tok"] = artist_lyrics_complete["Lyrics Clean Tok"].apply(lambda songs: [word for song in songs for word in song])
# Attach each artist's genre; the merge duplicates rows per song, so keep one
artist_lyrics_complete = artist_lyrics_complete.merge(df_lyrics[["Artist", "Genre"]], left_on="Artist", right_on="Artist").drop_duplicates("Artist").reset_index(drop=True)
artist_lyrics_complete.tail()
# TF-IDF over artist-level lyrics with uni- to tri-grams, capped vocabulary
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=30000)
artist_lyrics_tfidf = vectorizer.fit_transform(artist_lyrics_complete["Lyrics Clean No Tok"])
artist_lyrics_tfidf.shape
# Pairwise artist similarity; zero the diagonal (self-similarity)
artist_similarity_matrix = cosine_similarity(artist_lyrics_tfidf)
np.fill_diagonal(artist_similarity_matrix, 0)
artist_similarity_matrix.shape
find_most_similar_artist("Blake Shelton", 10)